257 lines
41 KiB
Plaintext
257 lines
41 KiB
Plaintext
[38;5;12m (https://spark.apache.org/)[39m
|
||
|
||
[38;5;12m [39m[38;2;255;187;0m[1m[4mAwesome Spark [0m[38;5;14m[1m[4m![0m[38;2;255;187;0m[1m[4mAwesome[0m[38;5;14m[1m[4m (https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)[0m[38;2;255;187;0m[1m[4m (https://github.com/sindresorhus/awesome)[0m
|
||
|
||
[38;5;12mA curated list of awesome [39m[38;5;14m[1mApache Spark[0m[38;5;12m (https://spark.apache.org/) packages and resources.[39m
|
||
|
||
[38;5;12m_Apache[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mis[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12mopen-source[39m[38;5;12m [39m[38;5;12mcluster-computing[39m[38;5;12m [39m[38;5;12mframework.[39m[38;5;12m [39m[38;5;12mOriginally[39m[38;5;12m [39m[38;5;12mdeveloped[39m[38;5;12m [39m[38;5;12mat[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mUniversity[0m[38;5;14m[1m [0m[38;5;14m[1mof[0m[38;5;14m[1m [0m[38;5;14m[1mCalifornia[0m[38;5;12m [39m[38;5;12m(https://www.universityofcalifornia.edu/),[39m[38;5;12m [39m[38;5;14m[1mBerkeley's[0m[38;5;14m[1m [0m[38;5;14m[1mAMPLab[0m[38;5;12m [39m[38;5;12m(https://amplab.cs.berkeley.edu/),[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mcodebase[39m[38;5;12m [39m[38;5;12mwas[39m[38;5;12m [39m
|
||
[38;5;12mlater[39m[38;5;12m [39m[38;5;12mdonated[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSoftware[0m[38;5;14m[1m [0m[38;5;14m[1mFoundation[0m[38;5;12m [39m[38;5;12m(https://www.apache.org/),[39m[38;5;12m [39m[38;5;12mwhich[39m[38;5;12m [39m[38;5;12mhas[39m[38;5;12m [39m[38;5;12mmaintained[39m[38;5;12m [39m[38;5;12mit[39m[38;5;12m [39m[38;5;12msince.[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mprovides[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12minterface[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mprogramming[39m[38;5;12m [39m[38;5;12mentire[39m[38;5;12m [39m[38;5;12mclusters[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mimplicit[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mparallelism[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mfault-tolerance_[39m[38;5;12m [39m[38;5;12m([39m[38;5;14m[1mWikipedia[0m[38;5;14m[1m [0m[38;5;14m[1m2017[0m[38;5;12m [39m
|
||
[38;5;12m(#wikipedia-2017)).[39m
|
||
|
||
[38;5;12mUsers of Apache Spark may choose between the Python, R, Scala and Java programming languages to interface with the Apache Spark APIs.[39m
|
||
|
||
[38;2;255;187;0m[4mContents[0m
|
||
|
||
[38;5;12m- [39m[38;5;14m[1mPackages[0m[38;5;12m (#packages)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mLanguage Bindings[0m[38;5;12m (#language-bindings)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mNotebooks and IDEs[0m[38;5;12m (#notebooks-and-ides)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mGeneral Purpose Libraries[0m[38;5;12m (#general-purpose-libraries)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mSQL Data Sources[0m[38;5;12m (#sql-data-sources)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mStorage[0m[38;5;12m (#storage)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mBioinformatics[0m[38;5;12m (#bioinformatics)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mGIS[0m[38;5;12m (#gis)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mTime Series Analytics[0m[38;5;12m (#time-series-analytics)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mGraph Processing[0m[38;5;12m (#graph-processing)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mMachine Learning Extension[0m[38;5;12m (#machine-learning-extension)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mMiddleware[0m[38;5;12m (#middleware)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mUtilities[0m[38;5;12m (#utilities)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mNatural Language Processing[0m[38;5;12m (#natural-language-processing)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mStreaming[0m[38;5;12m (#streaming)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mInterfaces[0m[38;5;12m (#interfaces)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mTesting[0m[38;5;12m (#testing)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mWeb Archives[0m[38;5;12m (#web-archives)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mWorkflow Management[0m[38;5;12m (#workflow-management)[39m
|
||
|
||
[38;5;12m- [39m[38;5;14m[1mResources[0m[38;5;12m (#resources)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mBooks[0m[38;5;12m (#books)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mPapers[0m[38;5;12m (#papers)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mMOOCS[0m[38;5;12m (#moocs)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mWorkshops[0m[38;5;12m (#workshops)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mProjects Using Spark[0m[38;5;12m (#projects-using-spark)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mDocker Images[0m[38;5;12m (#docker-images)[39m
|
||
[38;5;12m - [39m[38;5;14m[1mMiscellaneous[0m[38;5;12m (#miscellaneous)[39m
|
||
|
||
|
||
[38;2;255;187;0m[4mPackages[0m
|
||
|
||
[38;2;255;187;0m[4mLanguage Bindings[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mKotlin for Apache Spark[0m[38;5;12m (https://github.com/Kotlin/kotlin-spark-api) - Kotlin API bindings and extensions.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mFlambo[0m[38;5;12m (https://github.com/yieldbot/flambo) - Clojure DSL.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMobius[0m[38;5;12m (https://github.com/Microsoft/Mobius) - C# bindings (Deprecated in favor of .NET for Apache Spark).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1m.NET for Apache Spark[0m[38;5;12m (https://github.com/dotnet/spark) - .NET bindings.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparklyr[0m[38;5;12m (https://github.com/rstudio/sparklyr) - An alternative R backend, using [39m[48;5;235m[38;5;249m[1mdplyr[0m[38;5;12m (https://github.com/hadley/dplyr).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparkle[0m[38;5;12m (https://github.com/tweag/sparkle) - Haskell on Apache Spark.[39m
|
||
|
||
[38;2;255;187;0m[4mNotebooks and IDEs[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1malmond[0m[38;5;12m (https://almond.sh/) - A Scala kernel for [39m[38;5;14m[1mJupyter[0m[38;5;12m (https://jupyter.org/).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Zeppelin[0m[38;5;12m (https://zeppelin.incubator.apache.org/) - Web-based notebook that enables interactive data analytics with pluggable backends, integrated plotting, and extensive Spark support out-of-the-box.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPolynote[0m[38;5;12m [39m[38;5;12m(https://polynote.org/)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mPolynote:[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12mIDE-inspired[39m[38;5;12m [39m[38;5;12mpolyglot[39m[38;5;12m [39m[38;5;12mnotebook.[39m[38;5;12m [39m[38;5;12mIt[39m[38;5;12m [39m[38;5;12msupports[39m[38;5;12m [39m[38;5;12mmixing[39m[38;5;12m [39m[38;5;12mmultiple[39m[38;5;12m [39m[38;5;12mlanguages[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mone[39m[38;5;12m [39m[38;5;12mnotebook,[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12msharing[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mbetween[39m[38;5;12m [39m[38;5;12mthem[39m[38;5;12m [39m[38;5;12mseamlessly.[39m[38;5;12m [39m[38;5;12mIt[39m[38;5;12m [39m[38;5;12mencourages[39m[38;5;12m [39m[38;5;12mreproducible[39m[38;5;12m [39m[38;5;12mnotebooks[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mits[39m[38;5;12m [39m[38;5;12mimmutable[39m
|
||
[38;5;12mdata[39m[38;5;12m [39m[38;5;12mmodel.[39m[38;5;12m [39m[38;5;12mOriginating[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;14m[1mNetflix[0m[38;5;12m [39m[38;5;12m(https://medium.com/netflix-techblog/open-sourcing-polynote-an-ide-inspired-polyglot-notebook-7f929d3f447).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark Notebook[0m[38;5;12m (https://github.com/andypetrella/spark-notebook) - Scalable and stable Scala and Spark focused notebook bridging the gap between JVM and Data Scientists (incl. extendable, typesafe and reactive charts).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparkmagic[0m[38;5;12m [39m[38;5;12m(https://github.com/jupyter-incubator/sparkmagic)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;14m[1mJupyter[0m[38;5;12m [39m[38;5;12m(https://jupyter.org/)[39m[38;5;12m [39m[38;5;12mmagics[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mkernels[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mworking[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mremote[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mclusters,[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12minteractively[39m[38;5;12m [39m[38;5;12mworking[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mremote[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mclusters[39m[38;5;12m [39m[38;5;12mthrough[39m[38;5;12m [39m[38;5;14m[1mLivy[0m[38;5;12m [39m
|
||
[38;5;12m(https://github.com/cloudera/livy),[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mJupyter[39m[38;5;12m [39m[38;5;12mnotebooks.[39m
|
||
|
||
[38;2;255;187;0m[4mGeneral Purpose Libraries[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSuccinct[0m[38;5;12m (http://succinct.cs.berkeley.edu/) - Support for efficient queries on compressed data.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mitachi[0m[38;5;12m (https://github.com/yaooqinn/itachi) - A library that brings useful functions from modern database management systems to Apache Spark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-daria[0m[38;5;12m (https://github.com/mrpowers/spark-daria) - A Scala library with essential Spark functions and extensions to make you more productive.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mquinn[0m[38;5;12m (https://github.com/mrpowers/quinn) - A native PySpark implementation of spark-daria.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache DataFu[0m[38;5;12m (https://github.com/apache/datafu/tree/master/datafu-spark) - A library of general purpose functions and UDFs.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mJoblib Apache Spark Backend[0m[38;5;12m (https://github.com/joblib/joblib-spark) - [39m[48;5;235m[38;5;249m[1mjoblib[0m[38;5;12m (https://github.com/joblib/joblib) backend for running tasks on Spark clusters.[39m
|
||
|
||
[38;2;255;187;0m[4mSQL Data Sources[0m
|
||
|
||
[38;5;12mSparkSQL[39m[38;5;12m [39m[38;5;12mhas[39m[38;5;12m [39m[38;5;14m[1mseveral[0m[38;5;14m[1m [0m[38;5;14m[1mbuilt-in[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mSources[0m[38;5;12m [39m[38;5;12m(https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#manually-specifying-options)[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mfiles.[39m[38;5;12m [39m[38;5;12mThese[39m[38;5;12m [39m[38;5;12minclude[39m[38;5;12m [39m[48;5;235m[38;5;249mcsv[49m[39m[38;5;12m,[39m[38;5;12m [39m[48;5;235m[38;5;249mjson[49m[39m[38;5;12m,[39m[38;5;12m [39m[48;5;235m[38;5;249mparquet[49m[39m[38;5;12m,[39m[38;5;12m [39m[48;5;235m[38;5;249morc[49m[39m[38;5;12m,[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[48;5;235m[38;5;249mavro[49m[39m[38;5;12m.[39m[38;5;12m [39m[38;5;12mIt[39m[38;5;12m [39m[38;5;12malso[39m[38;5;12m [39m[38;5;12msupports[39m[38;5;12m [39m[38;5;12mJDBC[39m[38;5;12m [39m
|
||
[38;5;12mdatabases[39m[38;5;12m [39m[38;5;12mas[39m[38;5;12m [39m[38;5;12mwell[39m[38;5;12m [39m[38;5;12mas[39m[38;5;12m [39m[38;5;12mApache[39m[38;5;12m [39m[38;5;12mHive.[39m[38;5;12m [39m[38;5;12mAdditional[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12msources[39m[38;5;12m [39m[38;5;12mcan[39m[38;5;12m [39m[38;5;12mbe[39m[38;5;12m [39m[38;5;12madded[39m[38;5;12m [39m[38;5;12mby[39m[38;5;12m [39m[38;5;12mincluding[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mpackages[39m[38;5;12m [39m[38;5;12mlisted[39m[38;5;12m [39m[38;5;12mbelow,[39m[38;5;12m [39m[38;5;12mor[39m[38;5;12m [39m[38;5;12mwriting[39m[38;5;12m [39m[38;5;12myour[39m[38;5;12m [39m[38;5;12mown.[39m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark CSV[0m[38;5;12m (https://github.com/databricks/spark-csv) - CSV reader and writer (obsolete since Spark 2.0 [39m[38;5;12mSPARK-12833[39m[38;5;14m[1m [0m[38;5;12m (https://issues.apache.org/jira/browse/SPARK-12833)).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark Avro[0m[38;5;12m (https://github.com/databricks/spark-avro) - [39m[38;5;14m[1mApache Avro[0m[38;5;12m (https://avro.apache.org/) reader and writer (obsolete since Spark 2.4 [39m[38;5;12mSPARK-24768[39m[38;5;14m[1m [0m[38;5;12m (https://issues.apache.org/jira/browse/SPARK-24768)).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark XML[0m[38;5;12m (https://github.com/databricks/spark-xml) - XML parser and writer.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark Cassandra Connector[0m[38;5;12m (https://github.com/datastax/spark-cassandra-connector) - Cassandra support including data source and API and support for arbitrary queries.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark Riak Connector[0m[38;5;12m (https://github.com/basho/spark-riak-connector) - Riak TS & Riak KV connector.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMongo-Spark[0m[38;5;12m (https://github.com/mongodb/mongo-spark) - Official MongoDB connector.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOrientDB-Spark[0m[38;5;12m (https://github.com/orientechnologies/spark-orientdb) - Official OrientDB connector.[39m
|
||
|
||
[38;2;255;187;0m[4mStorage[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mDelta Lake[0m[38;5;12m (https://github.com/delta-io/delta) - Storage layer with ACID transactions.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mlakeFS[0m[38;5;12m (https://docs.lakefs.io/integrations/spark.html) - Integration with the lakeFS atomic versioned storage layer.[39m
|
||
[38;2;255;187;0m[4mBioinformatics[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mADAM[0m[38;5;12m (https://github.com/bigdatagenomics/adam) - Set of tools designed to analyse genomics data.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHail[0m[38;5;12m (https://github.com/hail-is/hail) - Genetic analysis framework.[39m
|
||
|
||
[38;2;255;187;0m[4mGIS[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMagellan[0m[38;5;12m (https://github.com/harsha2010/magellan) - Geospatial analytics using Spark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Sedona[0m[38;5;12m (https://github.com/apache/incubator-sedona) - Cluster computing system for processing large-scale spatial data.[39m
|
||
|
||
[38;2;255;187;0m[4mTime Series Analytics[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark-Timeseries[0m[38;5;12m (https://github.com/cloudera/spark-timeseries) - Scala / Java / Python library for interacting with time series data on Apache Spark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mflint[0m[38;5;12m (https://github.com/twosigma/flint) - A time series library for Apache Spark.[39m
|
||
|
||
[38;2;255;187;0m[4mGraph Processing[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMazerunner[0m[38;5;12m (https://github.com/neo4j-contrib/neo4j-mazerunner) - Graph analytics platform on top of Neo4j and GraphX.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mGraphFrames[0m[38;5;12m (https://github.com/graphframes/graphframes) - Data frame based graph API.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mneo4j-spark-connector[0m[38;5;12m (https://github.com/neo4j-contrib/neo4j-spark-connector) - Bolt protocol based, Neo4j Connector with RDD, DataFrame and GraphX / GraphFrames support.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSparklingGraph[0m[38;5;12m (http://sparkling.ml) - Library extending GraphX features with multiple functionalities useful in graph analytics (measures, generators, link prediction etc.).[39m
|
||
|
||
[38;2;255;187;0m[4mMachine Learning Extension[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mClustering4Ever[0m[38;5;12m (https://github.com/Clustering4Ever/Clustering4Ever) - Scala and Spark API to benchmark and analyse clustering algorithms on any vectorization you can generate.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mdbscan-on-spark[0m[38;5;12m [39m[38;5;12m(https://github.com/irvingc/dbscan-on-spark)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mAn[39m[38;5;12m [39m[38;5;12mImplementation[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mDBSCAN[39m[38;5;12m [39m[38;5;12mclustering[39m[38;5;12m [39m[38;5;12malgorithm[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mtop[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mApache[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mby[39m[38;5;12m [39m[38;5;14m[1mirvingc[0m[38;5;12m [39m[38;5;12m(https://github.com/irvingc)[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mbased[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mpaper[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mHe,[39m[38;5;12m [39m[38;5;12mYaobin,[39m[38;5;12m [39m[38;5;12met[39m[38;5;12m [39m[38;5;12mal.[39m[38;5;12m [39m
|
||
[38;5;14m[1mMR-DBSCAN:[0m[38;5;14m[1m [0m[38;5;14m[1ma[0m[38;5;14m[1m [0m[38;5;14m[1mscalable[0m[38;5;14m[1m [0m[38;5;14m[1mMapReduce-based[0m[38;5;14m[1m [0m[38;5;14m[1mDBSCAN[0m[38;5;14m[1m [0m[38;5;14m[1malgorithm[0m[38;5;14m[1m [0m[38;5;14m[1mfor[0m[38;5;14m[1m [0m[38;5;14m[1mheavily[0m[38;5;14m[1m [0m[38;5;14m[1mskewed[0m[38;5;14m[1m [0m[38;5;14m[1mdata[0m[38;5;12m [39m
|
||
[38;5;12m(https://www.researchgate.net/profile/Yaobin_He/publication/260523383_MR-DBSCAN_a_scalable_MapReduce-based_DBSCAN_algorithm_for_heavily_skewed_data/links/0046353a1763ee2bdf000000.pdf).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache SystemML[0m[38;5;12m (https://systemml.apache.org/) - Declarative machine learning framework on top of Spark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMahout Spark Bindings[0m[38;5;12m (https://mahout.apache.org/users/sparkbindings/home.html) [39m[38;5;12m*[39m[48;2;30;30;40m[38;5;13m[3mstatus unknown[0m[48;2;30;30;40m[38;5;13m[3m*[0m[38;5;12m - linear algebra DSL and optimizer with R-like syntax.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-sklearn[0m[38;5;12m (https://github.com/databricks/spark-sklearn) - Scikit-learn integration with distributed model training.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mKeystoneML[0m[38;5;12m (http://keystone-ml.org/) - Type safe machine learning pipelines with RDDs.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mJPMML-Spark[0m[38;5;12m (https://github.com/jpmml/jpmml-spark) - PMML transformer library for Spark ML.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mDistributed Keras[0m[38;5;12m (https://github.com/cerndb/dist-keras) - Distributed deep learning framework with PySpark and Keras.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mModelDB[0m[38;5;12m (https://mitdbg.github.io/modeldb) - A system to manage machine learning models for spark.ml and scikit-learn.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSparkling Water[0m[38;5;12m (https://github.com/h2oai/sparkling-water) - [39m[38;5;14m[1mH2O[0m[38;5;12m (http://www.h2o.ai/) interoperability layer.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBigDL[0m[38;5;12m (https://github.com/intel-analytics/BigDL) - Distributed Deep Learning library.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMLeap[0m[38;5;12m (https://github.com/combust/mleap) - Execution engine and serialization format which supports deployment of [39m[48;5;235m[38;5;249mo.a.s.ml[49m[39m[38;5;12m models without dependency on [39m[48;5;235m[38;5;249mSparkSession[49m[39m[38;5;12m.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMicrosoft ML for Apache Spark[0m[38;5;12m (https://github.com/Azure/mmlspark) - A distributed ML library with support for LightGBM, Vowpal Wabbit, OpenCV, Deep Learning, Cognitive Services, and Model Deployment.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMLflow[0m[38;5;12m (https://mlflow.org/docs/latest/python_api/mlflow.spark.html#module-mlflow.spark) - Machine learning orchestration platform. [39m
|
||
|
||
[38;2;255;187;0m[4mMiddleware[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mLivy[0m[38;5;12m (https://github.com/apache/incubator-livy) - REST server with extensive language support (Python, R, Scala), ability to maintain interactive sessions and object sharing.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-jobserver[0m[38;5;12m (https://github.com/spark-jobserver/spark-jobserver) - Simple Spark as a Service which supports object sharing using so-called named objects. JVM only.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMist[0m[38;5;12m (https://github.com/Hydrospheredata/mist) - Service for exposing Spark analytical jobs and machine learning models as realtime, batch or reactive web services.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Toree[0m[38;5;12m (https://github.com/apache/incubator-toree) - IPython protocol based middleware for interactive applications.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Kyuubi[0m[38;5;12m (https://github.com/apache/kyuubi) - A distributed multi-tenant JDBC server for large-scale data processing and analytics, built on top of Apache Spark.[39m
|
||
|
||
[38;2;255;187;0m[4mMonitoring[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mData Mechanics Delight[0m[38;5;12m (https://github.com/datamechanics/delight) - Cross-platform monitoring tool (Spark UI / Spark History Server replacement).[39m
|
||
|
||
[38;2;255;187;0m[4mUtilities[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msilex[0m[38;5;12m (https://github.com/willb/silex) - Collection of tools varying from ML extensions to additional RDD methods.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparkly[0m[38;5;12m (https://github.com/Tubular/sparkly) - Helpers & syntactic sugar for PySpark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mpyspark-stubs[0m[38;5;12m (https://github.com/zero323/pyspark-stubs) - Static type annotations for PySpark (obsolete since Spark 3.1. See [39m[38;5;14m[1mSPARK-32681[0m[38;5;12m (https://issues.apache.org/jira/browse/SPARK-32681)).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mFlintrock[0m[38;5;12m (https://github.com/nchammas/flintrock) - A command-line tool for launching Spark clusters on EC2.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOptimus[0m[38;5;12m (https://github.com/ironmussa/Optimus/) - Data Cleansing and Exploration utilities with the goal of simplifying data cleaning.[39m
|
||
|
||
[38;2;255;187;0m[4mNatural Language Processing[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-corenlp[0m[38;5;12m (https://github.com/databricks/spark-corenlp) - DataFrame wrapper for [39m[38;5;14m[1mStanford CoreNLP[0m[38;5;12m (https://stanfordnlp.github.io/CoreNLP/).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-nlp[0m[38;5;12m (https://github.com/JohnSnowLabs/spark-nlp) - Natural language processing library built on top of Apache Spark ML.[39m
|
||
|
||
[38;2;255;187;0m[4mStreaming[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Bahir[0m[38;5;12m (https://bahir.apache.org/) - Collection of the streaming connectors excluded from Spark 2.0 (Akka, MQTT, Twitter, ZeroMQ).[39m
|
||
|
||
[38;2;255;187;0m[4mInterfaces[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Beam[0m[38;5;12m (https://beam.apache.org/) - Unified data processing engine supporting both batch and streaming applications. Apache Spark is one of the supported execution environments.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBlaze[0m[38;5;12m (https://github.com/blaze/blaze) - Interface for querying larger than memory datasets using Pandas-like syntax. It supports both Spark [39m[48;5;235m[38;5;249mDataFrames[49m[39m[38;5;12m and [39m[48;5;235m[38;5;249mRDDs[49m[39m[38;5;12m.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mKoalas[0m[38;5;12m (https://github.com/databricks/koalas) - Pandas DataFrame API on top of Apache Spark.[39m
|
||
|
||
[38;2;255;187;0m[4mTesting[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mdeequ[0m[38;5;12m (https://github.com/awslabs/deequ) - Deequ is a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-testing-base[0m[38;5;12m (https://github.com/holdenk/spark-testing-base) - Collection of base test classes.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-fast-tests[0m[38;5;12m (https://github.com/MrPowers/spark-fast-tests) - A lightweight and fast testing framework.[39m
|
||
|
||
[38;2;255;187;0m[4mWeb Archives[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchives Unleashed Toolkit[0m[38;5;12m (https://github.com/archivesunleashed/aut) - Open-source toolkit for analyzing web archives.[39m
|
||
|
||
[38;2;255;187;0m[4mWorkflow Management[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCromwell[0m[38;5;12m (https://github.com/broadinstitute/cromwell#spark-backend) - Workflow management system with [39m[38;5;14m[1mSpark backend[0m[38;5;12m (https://github.com/broadinstitute/cromwell#spark-backend).[39m
|
||
|
||
[38;2;255;187;0m[4mResources[0m
|
||
|
||
[38;2;255;187;0m[4mBooks[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mLearning Spark, 2nd Edition[0m[38;5;12m (https://www.oreilly.com/library/view/learning-spark-2nd/9781492050032/) - Introduction to Spark API with Spark 3.0 covered. Good source of knowledge about basic concepts.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAdvanced Analytics with Spark[0m[38;5;12m (http://shop.oreilly.com/product/0636920035091.do) - Useful collection of Spark processing patterns. Accompanying GitHub repository: [39m[38;5;14m[1msryza/aas[0m[38;5;12m (https://github.com/sryza/aas).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMastering Apache Spark[0m[38;5;12m (https://jaceklaskowski.gitbooks.io/mastering-apache-spark/) - Interesting compilation of notes by [39m[38;5;14m[1mJacek Laskowski[0m[38;5;12m (https://github.com/jaceklaskowski). Focused on different aspects of Spark internals.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark Gotchas[0m[38;5;12m (https://github.com/awesome-spark/spark-gotchas) - Subjective compilation of tips, tricks and common programming mistakes.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1min[0m[38;5;14m[1m [0m[38;5;14m[1mAction[0m[38;5;12m [39m[38;5;12m(https://www.manning.com/books/spark-in-action)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mNew[39m[38;5;12m [39m[38;5;12mbook[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mManning's[39m[38;5;12m [39m[38;5;12m"in[39m[38;5;12m [39m[38;5;12maction"[39m[38;5;12m [39m[38;5;12mfamily[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12m+400[39m[38;5;12m [39m[38;5;12mpages.[39m[38;5;12m [39m[38;5;12mStarts[39m[38;5;12m [39m[38;5;12mgently,[39m[38;5;12m [39m[38;5;12mstep-by-step[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mcovers[39m[38;5;12m [39m[38;5;12mlarge[39m[38;5;12m [39m[38;5;12mnumber[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mtopics.[39m[38;5;12m [39m[38;5;12mFree[39m[38;5;12m [39m[38;5;12mexcerpt[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mhow[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;14m[1msetup[0m[38;5;14m[1m [0m[38;5;14m[1mEclipse[0m[38;5;14m[1m [0m[38;5;14m[1mfor[0m[38;5;14m[1m [0m
|
||
[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1mapplication[0m[38;5;14m[1m [0m[38;5;14m[1mdevelopment[0m[38;5;12m [39m[38;5;12m(http://freecontent.manning.com/how-to-start-developing-spark-applications-in-eclipse/)[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mhow[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mbootstrap[39m[38;5;12m [39m[38;5;12ma[39m[38;5;12m [39m[38;5;12mnew[39m[38;5;12m [39m[38;5;12mapplication[39m[38;5;12m [39m[38;5;12musing[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mprovided[39m[38;5;12m [39m[38;5;12mMaven[39m[38;5;12m [39m[38;5;12mArchetype.[39m[38;5;12m [39m[38;5;12mYou[39m[38;5;12m [39m[38;5;12mcan[39m[38;5;12m [39m[38;5;12mfind[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12maccompanying[39m[38;5;12m [39m[38;5;12mGitHub[39m[38;5;12m [39m[38;5;12mrepo[39m[38;5;12m [39m
|
||
[38;5;14m[1mhere[0m[38;5;12m [39m[38;5;12m(https://github.com/spark-in-action/first-edition).[39m
|
||
|
||
[38;2;255;187;0m[4mPapers[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mLarge-Scale Intelligent Microservices[0m
|
||
[38;5;12m (https://arxiv.org/pdf/2009.08044.pdf) - Microsoft paper that presents an Apache Spark-based micro-service orchestration framework that extends database operations to include web service primitives.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mResilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing[0m[38;5;12m (https://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf) - Paper introducing a core distributed memory abstraction.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark SQL: Relational Data Processing in Spark[0m[38;5;12m (https://amplab.cs.berkeley.edu/wp-content/uploads/2015/03/SparkSQLSigmod2015.pdf) - Paper introducing relational underpinnings, code generation and Catalyst optimizer.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mStructured Streaming: A Declarative API for Real-Time Applications in Apache Spark[0m
|
||
[38;5;12m (https://cs.stanford.edu/~matei/papers/2018/sigmod_structured_streaming.pdf) - Structured Streaming is a new high-level streaming API; it is a declarative API based on automatically incrementalizing a static relational query.[39m
|
||
|
||
[38;2;255;187;0m[4mMOOCS[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mScience[0m[38;5;14m[1m [0m[38;5;14m[1mand[0m[38;5;14m[1m [0m[38;5;14m[1mEngineering[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1m(edX[0m[38;5;14m[1m [0m[38;5;14m[1mXSeries)[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/xseries/data-science-engineering-apache-spark)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mSeries[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mfive[39m[38;5;12m [39m[38;5;12mcourses[39m[38;5;12m [39m[38;5;12m([39m[38;5;14m[1mIntroduction[0m[38;5;14m[1m [0m[38;5;14m[1mto[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m
|
||
[38;5;12m(https://www.edx.org/course/introduction-apache-spark-uc-berkeleyx-cs105x),[39m[38;5;12m [39m[38;5;14m[1mDistributed[0m[38;5;14m[1m [0m[38;5;14m[1mMachine[0m[38;5;14m[1m [0m[38;5;14m[1mLearning[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/course/distributed-machine-learning-apache-uc-berkeleyx-cs120x),[39m[38;5;12m [39m[38;5;14m[1mBig[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mAnalysis[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m
|
||
[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/course/big-data-analysis-apache-spark-uc-berkeleyx-cs110x),[39m[38;5;12m [39m[38;5;14m[1mAdvanced[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1mfor[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mScience[0m[38;5;14m[1m [0m[38;5;14m[1mand[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mEngineering[0m[38;5;12m [39m
|
||
[38;5;12m(https://www.edx.org/course/advanced-apache-spark-data-science-data-uc-berkeleyx-cs115x),[39m[38;5;12m [39m[38;5;14m[1mAdvanced[0m[38;5;14m[1m [0m[38;5;14m[1mDistributed[0m[38;5;14m[1m [0m[38;5;14m[1mMachine[0m[38;5;14m[1m [0m[38;5;14m[1mLearning[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/course/advanced-distributed-machine-learning-uc-berkeleyx-cs125x))[39m[38;5;12m [39m
|
||
[38;5;12mcovering[39m[38;5;12m [39m[38;5;12mdifferent[39m[38;5;12m [39m[38;5;12maspects[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12msoftware[39m[38;5;12m [39m[38;5;12mengineering[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mscience.[39m[38;5;12m [39m[38;5;12mPython[39m[38;5;12m [39m[38;5;12moriented.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBig[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mAnalysis[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mScala[0m[38;5;14m[1m [0m[38;5;14m[1mand[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1m(Coursera)[0m[38;5;12m [39m[38;5;12m(https://www.coursera.org/learn/big-data-analysys)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mScala[39m[38;5;12m [39m[38;5;12moriented[39m[38;5;12m [39m[38;5;12mintroductory[39m[38;5;12m [39m[38;5;12mcourse.[39m[38;5;12m [39m[38;5;12mPart[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;14m[1mFunctional[0m[38;5;14m[1m [0m[38;5;14m[1mProgramming[0m[38;5;14m[1m [0m[38;5;14m[1min[0m[38;5;14m[1m [0m[38;5;14m[1mScala[0m[38;5;14m[1m [0m[38;5;14m[1mSpecialization[0m[38;5;12m [39m
|
||
[38;5;12m(https://www.coursera.org/specializations/scala).[39m
|
||
|
||
[38;2;255;187;0m[4mWorkshops[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAMP[0m[38;5;14m[1m [0m[38;5;14m[1mCamp[0m[38;5;12m [39m[38;5;12m(http://ampcamp.berkeley.edu)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mPeriodic[39m[38;5;12m [39m[38;5;12mtraining[39m[38;5;12m [39m[38;5;12mevent[39m[38;5;12m [39m[38;5;12morganized[39m[38;5;12m [39m[38;5;12mby[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mUC[0m[38;5;14m[1m [0m[38;5;14m[1mBerkeley[0m[38;5;14m[1m [0m[38;5;14m[1mAMPLab[0m[38;5;12m [39m[38;5;12m(https://amplab.cs.berkeley.edu/).[39m[38;5;12m [39m[38;5;12mA[39m[38;5;12m [39m[38;5;12msource[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12museful[39m[38;5;12m [39m[38;5;12mexercises[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mrecorded[39m[38;5;12m [39m[38;5;12mworkshops[39m[38;5;12m [39m[38;5;12mcovering[39m[38;5;12m [39m[38;5;12mdifferent[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mBerkeley[0m[38;5;14m[1m [0m
|
||
[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mAnalytics[0m[38;5;14m[1m [0m[38;5;14m[1mStack[0m[38;5;12m [39m[38;5;12m(https://amplab.cs.berkeley.edu/software/).[39m
|
||
|
||
[38;2;255;187;0m[4mProjects Using Spark[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOryx[0m[38;5;14m[1m [0m[38;5;14m[1m2[0m[38;5;12m [39m[38;5;12m(https://github.com/OryxProject/oryx)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;14m[1mLambda[0m[38;5;14m[1m [0m[38;5;14m[1marchitecture[0m[38;5;12m [39m[38;5;12m(http://lambda-architecture.net/)[39m[38;5;12m [39m[38;5;12mplatform[39m[38;5;12m [39m[38;5;12mbuilt[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mApache[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mKafka[0m[38;5;12m [39m[38;5;12m(http://kafka.apache.org/)[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mspecialization[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mreal-time[39m[38;5;12m [39m[38;5;12mlarge[39m[38;5;12m [39m[38;5;12mscale[39m[38;5;12m [39m[38;5;12mmachine[39m[38;5;12m [39m
|
||
[38;5;12mlearning.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPhoton ML[0m[38;5;12m (https://github.com/linkedin/photon-ml) - A machine learning library supporting classical Generalized Mixed Model and Generalized Additive Mixed Effect Model.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPredictionIO[0m[38;5;12m (https://prediction.io/) - Machine Learning server for developers and data scientists to build and deploy predictive applications in a fraction of the time.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCrossdata[0m[38;5;12m (https://github.com/Stratio/Crossdata) - Data integration platform with extended DataSource API and multi-user environment.[39m
|
||
|
||
|
||
[38;2;255;187;0m[4mDocker Images[0m
|
||
|
||
[38;5;12m- [39m[38;5;14m[1mapache/spark[0m[38;5;12m (https://hub.docker.com/r/apache/spark) - Apache Spark Official Docker images.[39m
|
||
[38;5;12m- [39m[38;5;14m[1mjupyter/docker-stacks/pyspark-notebook[0m[38;5;12m (https://github.com/jupyter/docker-stacks/tree/master/pyspark-notebook) - PySpark with Jupyter Notebook and Mesos client.[39m
|
||
[38;5;12m- [39m[38;5;14m[1msequenceiq/docker-spark[0m[38;5;12m (https://github.com/sequenceiq/docker-spark) - YARN images from [39m[38;5;14m[1mSequenceIQ[0m[38;5;12m (http://www.sequenceiq.com/).[39m
|
||
[38;5;12m- [39m[38;5;14m[1mdatamechanics/spark[0m[38;5;12m (https://hub.docker.com/r/datamechanics/spark) - An easy-to-set-up Docker image for Apache Spark from [39m[38;5;14m[1mData Mechanics[0m[38;5;12m (https://www.datamechanics.co/).[39m
|
||
|
||
[38;2;255;187;0m[4mMiscellaneous[0m
|
||
|
||
[38;5;12m- [39m[38;5;14m[1mSpark with Scala Gitter channel[0m[38;5;12m (https://gitter.im/spark-scala/Lobby) - "_A place to discuss and ask questions about using Scala for Spark programming_" started by [39m[38;5;14m[1m@deanwampler[0m[38;5;12m (https://github.com/deanwampler).[39m
|
||
[38;5;12m-[39m[38;5;12m [39m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1mUser[0m[38;5;14m[1m [0m[38;5;14m[1mList[0m[38;5;12m [39m[38;5;12m(http://apache-spark-user-list.1001560.n3.nabble.com/)[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1mDevelopers[0m[38;5;14m[1m [0m[38;5;14m[1mList[0m[38;5;12m [39m[38;5;12m(http://apache-spark-developers-list.1001551.n3.nabble.com/)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mMailing[39m[38;5;12m [39m[38;5;12mlists[39m[38;5;12m [39m[38;5;12mdedicated[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12musage[39m[38;5;12m [39m[38;5;12mquestions[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mdevelopment[39m[38;5;12m [39m
|
||
[38;5;12mtopics[39m[38;5;12m [39m[38;5;12mrespectively.[39m
|
||
|
||
[38;2;255;187;0m[4mReferences[0m
|
||
|
||
|
||
|
||
[38;2;255;187;0m[4mLicense[0m
|
||
|
||
|
||
|
||
[38;5;12m<img src="https://mirrors.creativecommons.org/presskit/buttons/88x31/svg/publicdomain.svg"[39m
|
||
[48;5;235m[38;5;249m style="border-style: none;" alt="Public Domain Mark" />[49m[39m
|
||
|
||
|
||
[38;5;12mThis work (Awesome Spark) is free of known copyright restrictions.[39m
|
||
|
||
|
||
[38;5;12mApache Spark, Spark, Apache, and the Spark logo are trademarks of[39m
|
||
[38;5;12m The Apache Software Foundation. This compilation is not endorsed by The Apache Software Foundation.[39m
|
||
|
||
|
||
[38;5;12mInspired by [39m[38;5;14m[1msindresorhus/awesome[0m[38;5;12m (https://github.com/sindresorhus/awesome).[39m
|