206 lines
32 KiB
Plaintext
206 lines
32 KiB
Plaintext
[38;5;12m (https://spark.apache.org/)[39m
|
||
|
||
[38;5;12m [39m[38;2;255;187;0m[1m[4mAwesome Spark [0m[38;5;14m[1m[4m![0m[38;2;255;187;0m[1m[4mAwesome[0m[38;5;14m[1m[4m (https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)[0m[38;2;255;187;0m[1m[4m (https://github.com/sindresorhus/awesome)[0m
|
||
|
||
[38;5;12mA curated list of awesome [39m[38;5;14m[1mApache Spark[0m[38;5;12m (https://spark.apache.org/) packages and resources.[39m
|
||
|
||
[38;5;12m_Apache[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mis[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12mopen-source[39m[38;5;12m [39m[38;5;12mcluster-computing[39m[38;5;12m [39m[38;5;12mframework.[39m[38;5;12m [39m[38;5;12mOriginally[39m[38;5;12m [39m[38;5;12mdeveloped[39m[38;5;12m [39m[38;5;12mat[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mUniversity[0m[38;5;14m[1m [0m[38;5;14m[1mof[0m[38;5;14m[1m [0m[38;5;14m[1mCalifornia[0m[38;5;12m [39m[38;5;12m(https://www.universityofcalifornia.edu/),[39m[38;5;12m [39m[38;5;14m[1mBerkeley's[0m[38;5;14m[1m [0m[38;5;14m[1mAMPLab[0m[38;5;12m [39m[38;5;12m(https://amplab.cs.berkeley.edu/),[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mcodebase[39m[38;5;12m [39m[38;5;12mwas[39m[38;5;12m [39m[38;5;12mlater[39m[38;5;12m [39m[38;5;12mdonated[39m
|
||
[38;5;12mto[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSoftware[0m[38;5;14m[1m [0m[38;5;14m[1mFoundation[0m[38;5;12m [39m[38;5;12m(https://www.apache.org/),[39m[38;5;12m [39m[38;5;12mwhich[39m[38;5;12m [39m[38;5;12mhas[39m[38;5;12m [39m[38;5;12mmaintained[39m[38;5;12m [39m[38;5;12mit[39m[38;5;12m [39m[38;5;12msince.[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mprovides[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12minterface[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mprogramming[39m[38;5;12m [39m[38;5;12mentire[39m[38;5;12m [39m[38;5;12mclusters[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mimplicit[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mparallelism[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mfault-tolerance_[39m[38;5;12m [39m[38;5;12m([39m[38;5;14m[1mWikipedia[0m[38;5;14m[1m [0m[38;5;14m[1m2017[0m[38;5;12m [39m[38;5;12m(#wikipedia-2017)).[39m
|
||
|
||
[38;5;12mUsers of Apache Spark may choose between the Python, R, Scala and Java programming languages to interface with the Apache Spark APIs.[39m
|
||
|
||
[38;2;255;187;0m[4mPackages[0m
|
||
|
||
[38;2;255;187;0m[4mLanguage Bindings[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mKotlin for Apache Spark[0m[38;5;12m (https://github.com/Kotlin/kotlin-spark-api) - Kotlin API bindings and extensions.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1m.NET for Apache Spark[0m[38;5;12m (https://github.com/dotnet/spark) - .NET bindings.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparklyr[0m[38;5;12m (https://github.com/rstudio/sparklyr) - An alternative R backend, using [39m[48;5;235m[38;5;249m[1mdplyr[0m[38;5;12m (https://github.com/hadley/dplyr).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparkle[0m[38;5;12m (https://github.com/tweag/sparkle) - Haskell on Apache Spark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-connect-rs[0m[38;5;12m (https://github.com/sjrusso8/spark-connect-rs) - Rust bindings.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-connect-go[0m[38;5;12m (https://github.com/apache/spark-connect-go) - Golang bindings.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-connect-csharp[0m[38;5;12m (https://github.com/mdrakiburrahman/spark-connect-csharp) - C# bindings.[39m
|
||
|
||
[38;2;255;187;0m[4mNotebooks and IDEs[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1malmond[0m[38;5;12m (https://almond.sh/) - A Scala kernel for [39m[38;5;14m[1mJupyter[0m[38;5;12m (https://jupyter.org/).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Zeppelin[0m[38;5;12m (https://zeppelin.incubator.apache.org/) - Web-based notebook that enables interactive data analytics with pluggable backends, integrated plotting, and extensive Spark support out-of-the-box.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPolynote[0m[38;5;12m [39m[38;5;12m(https://polynote.org/)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mPolynote:[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12mIDE-inspired[39m[38;5;12m [39m[38;5;12mpolyglot[39m[38;5;12m [39m[38;5;12mnotebook.[39m[38;5;12m [39m[38;5;12mIt[39m[38;5;12m [39m[38;5;12msupports[39m[38;5;12m [39m[38;5;12mmixing[39m[38;5;12m [39m[38;5;12mmultiple[39m[38;5;12m [39m[38;5;12mlanguages[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mone[39m[38;5;12m [39m[38;5;12mnotebook,[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12msharing[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mbetween[39m[38;5;12m [39m[38;5;12mthem[39m[38;5;12m [39m[38;5;12mseamlessly.[39m[38;5;12m [39m[38;5;12mIt[39m[38;5;12m [39m[38;5;12mencourages[39m[38;5;12m [39m[38;5;12mreproducible[39m[38;5;12m [39m[38;5;12mnotebooks[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mits[39m[38;5;12m [39m[38;5;12mimmutable[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m
|
||
[38;5;12mmodel.[39m[38;5;12m [39m[38;5;12mOriginating[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;14m[1mNetflix[0m[38;5;12m [39m[38;5;12m(https://medium.com/netflix-techblog/open-sourcing-polynote-an-ide-inspired-polyglot-notebook-7f929d3f447).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparkmagic[0m[38;5;12m [39m[38;5;12m(https://github.com/jupyter-incubator/sparkmagic)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;14m[1mJupyter[0m[38;5;12m [39m[38;5;12m(https://jupyter.org/)[39m[38;5;12m [39m[38;5;12mmagics[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mkernels[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mworking[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mremote[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mclusters,[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12minteractively[39m[38;5;12m [39m[38;5;12mworking[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mremote[39m[38;5;12m [39m[38;5;12mSpark[39m[38;5;12m [39m[38;5;12mclusters[39m[38;5;12m [39m[38;5;12mthrough[39m[38;5;12m [39m[38;5;14m[1mLivy[0m[38;5;12m [39m
|
||
[38;5;12m(https://github.com/cloudera/livy),[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mJupyter[39m[38;5;12m [39m[38;5;12mnotebooks.[39m
|
||
|
||
[38;2;255;187;0m[4mGeneral Purpose Libraries[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mitachi[0m[38;5;12m (https://github.com/yaooqinn/itachi) - A library that brings useful functions from modern database management systems to Apache Spark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-daria[0m[38;5;12m (https://github.com/mrpowers-io/spark-daria) - A Scala library with essential Spark functions and extensions to make you more productive.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mquinn[0m[38;5;12m (https://github.com/mrpowers-io/quinn) - A native PySpark implementation of spark-daria.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache DataFu[0m[38;5;12m (https://github.com/apache/datafu/tree/master/datafu-spark) - A library of general purpose functions and UDFs.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mJoblib Apache Spark Backend[0m[38;5;12m (https://github.com/joblib/joblib-spark) - [39m[48;5;235m[38;5;249m[1mjoblib[0m[38;5;12m (https://github.com/joblib/joblib) backend for running tasks on Spark clusters.[39m
|
||
|
||
[38;2;255;187;0m[4mSQL Data Sources[0m
|
||
|
||
[38;5;12mSparkSQL[39m[38;5;12m [39m[38;5;12mhas[39m[38;5;12m [39m[38;5;14m[1mseveral[0m[38;5;14m[1m [0m[38;5;14m[1mbuilt-in[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mSources[0m[38;5;12m [39m[38;5;12m(https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#manually-specifying-options)[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mfiles.[39m[38;5;12m [39m[38;5;12mThese[39m[38;5;12m [39m[38;5;12minclude[39m[38;5;12m [39m[48;5;235m[38;5;249mcsv[49m[39m[38;5;12m,[39m[38;5;12m [39m[48;5;235m[38;5;249mjson[49m[39m[38;5;12m,[39m[38;5;12m [39m[48;5;235m[38;5;249mparquet[49m[39m[38;5;12m,[39m[38;5;12m [39m[48;5;235m[38;5;249morc[49m[39m[38;5;12m,[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[48;5;235m[38;5;249mavro[49m[39m[38;5;12m.[39m[38;5;12m [39m[38;5;12mIt[39m[38;5;12m [39m[38;5;12malso[39m[38;5;12m [39m[38;5;12msupports[39m[38;5;12m [39m[38;5;12mJDBC[39m[38;5;12m [39m[38;5;12mdatabases[39m[38;5;12m [39m
|
||
[38;5;12mas[39m[38;5;12m [39m[38;5;12mwell[39m[38;5;12m [39m[38;5;12mas[39m[38;5;12m [39m[38;5;12mApache[39m[38;5;12m [39m[38;5;12mHive.[39m[38;5;12m [39m[38;5;12mAdditional[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12msources[39m[38;5;12m [39m[38;5;12mcan[39m[38;5;12m [39m[38;5;12mbe[39m[38;5;12m [39m[38;5;12madded[39m[38;5;12m [39m[38;5;12mby[39m[38;5;12m [39m[38;5;12mincluding[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mpackages[39m[38;5;12m [39m[38;5;12mlisted[39m[38;5;12m [39m[38;5;12mbelow,[39m[38;5;12m [39m[38;5;12mor[39m[38;5;12m [39m[38;5;12mwriting[39m[38;5;12m [39m[38;5;12myour[39m[38;5;12m [39m[38;5;12mown.[39m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark XML[0m[38;5;12m (https://github.com/databricks/spark-xml) - XML parser and writer.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark Cassandra Connector[0m[38;5;12m (https://github.com/datastax/spark-cassandra-connector) - Cassandra support including data source and API and support for arbitrary queries.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMongo-Spark[0m[38;5;12m (https://github.com/mongodb/mongo-spark) - Official MongoDB connector.[39m
|
||
|
||
[38;2;255;187;0m[4mStorage[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mDelta Lake[0m[38;5;12m (https://github.com/delta-io/delta) - Storage layer with ACID transactions.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Hudi[0m[38;5;12m (https://github.com/apache/hudi) - Upserts, Deletes And Incremental Processing on Big Data.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Iceberg[0m[38;5;12m (https://github.com/apache/iceberg) - Open table format for huge analytic datasets.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mlakeFS[0m[38;5;12m (https://docs.lakefs.io/integrations/spark.html) - Integration with the lakeFS atomic versioned storage layer.[39m
|
||
|
||
[38;2;255;187;0m[4mBioinformatics[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mADAM[0m[38;5;12m (https://github.com/bigdatagenomics/adam) - Set of tools designed to analyse genomics data.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHail[0m[38;5;12m (https://github.com/hail-is/hail) - Genetic analysis framework.[39m
|
||
|
||
[38;2;255;187;0m[4mGIS[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Sedona[0m[38;5;12m (https://github.com/apache/incubator-sedona) - Cluster computing system for processing large-scale spatial data.[39m
|
||
|
||
[38;2;255;187;0m[4mGraph Processing[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mGraphFrames[0m[38;5;12m (https://github.com/graphframes/graphframes) - Data frame based graph API.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mneo4j-spark-connector[0m[38;5;12m (https://github.com/neo4j-contrib/neo4j-spark-connector) - Bolt protocol based, Neo4j Connector with RDD, DataFrame and GraphX / GraphFrames support.[39m
|
||
|
||
[38;2;255;187;0m[4mMachine Learning Extension[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache SystemML[0m[38;5;12m (https://systemml.apache.org/) - Declarative machine learning framework on top of Spark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMahout Spark Bindings[0m[38;5;12m (https://mahout.apache.org/users/sparkbindings/home.html) [39m[38;5;12m*[39m[48;2;30;30;40m[38;5;13m[3mstatus unknown[0m[48;2;30;30;40m[38;5;13m[3m*[0m[38;5;12m - linear algebra DSL and optimizer with R-like syntax.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mKeystoneML[0m[38;5;12m (http://keystone-ml.org/) - Type safe machine learning pipelines with RDDs.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mJPMML-Spark[0m[38;5;12m (https://github.com/jpmml/jpmml-spark) - PMML transformer library for Spark ML.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mModelDB[0m[38;5;12m (https://mitdbg.github.io/modeldb) - A system to manage machine learning models.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSparkling Water[0m[38;5;12m (https://github.com/h2oai/sparkling-water) - [39m[38;5;14m[1mH2O[0m[38;5;12m (http://www.h2o.ai/) interoperability layer.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBigDL[0m[38;5;12m (https://github.com/intel-analytics/BigDL) - Distributed Deep Learning library.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMLeap[0m[38;5;12m (https://github.com/combust/mleap) - Execution engine and serialization format which supports deployment of [39m[48;5;235m[38;5;249mo.a.s.ml[49m[39m[38;5;12m models without dependency on [39m[48;5;235m[38;5;249mSparkSession[49m[39m[38;5;12m.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMicrosoft ML for Apache Spark[0m[38;5;12m (https://github.com/Azure/mmlspark) - A distributed ml library with support for LightGBM, Vowpal Wabbit, OpenCV, Deep Learning, Cognitive Services, and Model Deployment.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMLflow[0m[38;5;12m (https://mlflow.org/docs/latest/python_api/mlflow.spark.html#module-mlflow.spark) - Machine learning orchestration platform. [39m
|
||
|
||
[38;2;255;187;0m[4mMiddleware[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mLivy[0m[38;5;12m (https://github.com/apache/incubator-livy) - REST server with extensive language support (Python, R, Scala), ability to maintain interactive sessions and object sharing.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-jobserver[0m[38;5;12m (https://github.com/spark-jobserver/spark-jobserver) - Simple Spark as a Service which supports object sharing using so-called named objects. JVM only.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Toree[0m[38;5;12m (https://github.com/apache/incubator-toree) - IPython protocol based middleware for interactive applications.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Kyuubi[0m[38;5;12m (https://github.com/apache/kyuubi) - A distributed multi-tenant JDBC server for large-scale data processing and analytics, built on top of Apache Spark.[39m
|
||
|
||
[38;2;255;187;0m[4mMonitoring[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mData Mechanics Delight[0m[38;5;12m (https://github.com/datamechanics/delight) - Cross-platform monitoring tool (Spark UI / Spark History Server replacement).[39m
|
||
|
||
[38;2;255;187;0m[4mUtilities[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1msparkly[0m[38;5;12m (https://github.com/Tubular/sparkly) - Helpers & syntactic sugar for PySpark.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mFlintrock[0m[38;5;12m (https://github.com/nchammas/flintrock) - A command-line tool for launching Spark clusters on EC2.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOptimus[0m[38;5;12m (https://github.com/ironmussa/Optimus/) - Data Cleansing and Exploration utilities with the goal of simplifying data cleaning.[39m
|
||
|
||
[38;2;255;187;0m[4mNatural Language Processing[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-nlp[0m[38;5;12m (https://github.com/JohnSnowLabs/spark-nlp) - Natural language processing library built on top of Apache Spark ML.[39m
|
||
|
||
[38;2;255;187;0m[4mStreaming[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Bahir[0m[38;5;12m (https://bahir.apache.org/) - Collection of the streaming connectors excluded from Spark 2.0 (Akka, MQTT, Twitter, ZeroMQ).[39m
|
||
|
||
[38;2;255;187;0m[4mInterfaces[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mApache Beam[0m[38;5;12m (https://beam.apache.org/) - Unified data processing engine supporting both batch and streaming applications. Apache Spark is one of the supported execution environments.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mKoalas[0m[38;5;12m (https://github.com/databricks/koalas) - Pandas DataFrame API on top of Apache Spark.[39m
|
||
|
||
[38;2;255;187;0m[4mData quality[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mdeequ[0m[38;5;12m (https://github.com/awslabs/deequ) - Deequ is a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mpython-deequ[0m[38;5;12m (https://github.com/awslabs/python-deequ) - Python API for Deequ.[39m
|
||
|
||
[38;2;255;187;0m[4mTesting[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-testing-base[0m[38;5;12m (https://github.com/holdenk/spark-testing-base) - Collection of base test classes.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mspark-fast-tests[0m[38;5;12m (https://github.com/mrpowers-io/spark-fast-tests) - A lightweight and fast testing framework.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mchispa[0m[38;5;12m (https://github.com/MrPowers/chispa) - PySpark test helpers with beautiful error messages.[39m
|
||
|
||
[38;2;255;187;0m[4mWeb Archives[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchives Unleashed Toolkit[0m[38;5;12m (https://github.com/archivesunleashed/aut) - Open-source toolkit for analyzing web archives.[39m
|
||
|
||
[38;2;255;187;0m[4mWorkflow Management[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCromwell[0m[38;5;12m (https://github.com/broadinstitute/cromwell#spark-backend) - Workflow management system with [39m[38;5;14m[1mSpark backend[0m[38;5;12m (https://github.com/broadinstitute/cromwell#spark-backend).[39m
|
||
|
||
[38;2;255;187;0m[4mResources[0m
|
||
|
||
[38;2;255;187;0m[4mBooks[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mLearning Spark, 2nd Edition[0m[38;5;12m (https://www.oreilly.com/library/view/learning-spark-2nd/9781492050032/) - Introduction to Spark API with Spark 3.0 covered. Good source of knowledge about basic concepts.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAdvanced Analytics with Spark[0m[38;5;12m (http://shop.oreilly.com/product/0636920035091.do) - Useful collection of Spark processing patterns. Accompanying GitHub repository: [39m[38;5;14m[1msryza/aas[0m[38;5;12m (https://github.com/sryza/aas).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMastering Apache Spark[0m[38;5;12m (https://jaceklaskowski.gitbooks.io/mastering-apache-spark/) - Interesting compilation of notes by [39m[38;5;14m[1mJacek Laskowski[0m[38;5;12m (https://github.com/jaceklaskowski). Focused on different aspects of Spark internals.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1min[0m[38;5;14m[1m [0m[38;5;14m[1mAction[0m[38;5;12m [39m[38;5;12m(https://www.manning.com/books/spark-in-action)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mNew[39m[38;5;12m [39m[38;5;12mbook[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mManning's[39m[38;5;12m [39m[38;5;12m"in[39m[38;5;12m [39m[38;5;12maction"[39m[38;5;12m [39m[38;5;12mfamily[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12m+400[39m[38;5;12m [39m[38;5;12mpages.[39m[38;5;12m [39m[38;5;12mStarts[39m[38;5;12m [39m[38;5;12mgently,[39m[38;5;12m [39m[38;5;12mstep-by-step[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mcovers[39m[38;5;12m [39m[38;5;12mlarge[39m[38;5;12m [39m[38;5;12mnumber[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mtopics.[39m[38;5;12m [39m[38;5;12mFree[39m[38;5;12m [39m[38;5;12mexcerpt[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mhow[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;14m[1msetup[0m[38;5;14m[1m [0m[38;5;14m[1mEclipse[0m[38;5;14m[1m [0m[38;5;14m[1mfor[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m
|
||
[38;5;14m[1mapplication[0m[38;5;14m[1m [0m[38;5;14m[1mdevelopment[0m[38;5;12m [39m[38;5;12m(http://freecontent.manning.com/how-to-start-developing-spark-applications-in-eclipse/)[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mhow[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mbootstrap[39m[38;5;12m [39m[38;5;12ma[39m[38;5;12m [39m[38;5;12mnew[39m[38;5;12m [39m[38;5;12mapplication[39m[38;5;12m [39m[38;5;12musing[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mprovided[39m[38;5;12m [39m[38;5;12mMaven[39m[38;5;12m [39m[38;5;12mArchetype.[39m[38;5;12m [39m[38;5;12mYou[39m[38;5;12m [39m[38;5;12mcan[39m[38;5;12m [39m[38;5;12mfind[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12maccompanying[39m[38;5;12m [39m[38;5;12mGitHub[39m[38;5;12m [39m[38;5;12mrepo[39m[38;5;12m [39m[38;5;14m[1mhere[0m[38;5;12m [39m
|
||
[38;5;12m(https://github.com/spark-in-action/first-edition).[39m
|
||
|
||
[38;2;255;187;0m[4mPapers[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mLarge-Scale Intelligent Microservices[0m[38;5;12m (https://arxiv.org/pdf/2009.08044.pdf) - Microsoft paper that presents an Apache Spark-based micro-service orchestration framework that extends database operations to include web service primitives.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mResilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing[0m[38;5;12m (https://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf) - Paper introducing a core distributed memory abstraction.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSpark SQL: Relational Data Processing in Spark[0m[38;5;12m (https://amplab.cs.berkeley.edu/wp-content/uploads/2015/03/SparkSQLSigmod2015.pdf) - Paper introducing relational underpinnings, code generation and Catalyst optimizer.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mStructured Streaming: A Declarative API for Real-Time Applications in Apache Spark[0m
|
||
[38;5;12m (https://cs.stanford.edu/~matei/papers/2018/sigmod_structured_streaming.pdf) - Paper introducing Structured Streaming, a high-level declarative streaming API based on automatically incrementalizing a static relational query.[39m
|
||
|
||
[38;2;255;187;0m[4mMOOCs[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mScience[0m[38;5;14m[1m [0m[38;5;14m[1mand[0m[38;5;14m[1m [0m[38;5;14m[1mEngineering[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1m(edX[0m[38;5;14m[1m [0m[38;5;14m[1mXSeries)[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/xseries/data-science-engineering-apache-spark)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mSeries[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mfive[39m[38;5;12m [39m[38;5;12mcourses[39m[38;5;12m [39m[38;5;12m([39m[38;5;14m[1mIntroduction[0m[38;5;14m[1m [0m[38;5;14m[1mto[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m
|
||
[38;5;12m(https://www.edx.org/course/introduction-apache-spark-uc-berkeleyx-cs105x),[39m[38;5;12m [39m[38;5;14m[1mDistributed[0m[38;5;14m[1m [0m[38;5;14m[1mMachine[0m[38;5;14m[1m [0m[38;5;14m[1mLearning[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/course/distributed-machine-learning-apache-uc-berkeleyx-cs120x),[39m[38;5;12m [39m[38;5;14m[1mBig[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mAnalysis[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m
|
||
[38;5;12m(https://www.edx.org/course/big-data-analysis-apache-spark-uc-berkeleyx-cs110x),[39m[38;5;12m [39m[38;5;14m[1mAdvanced[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1mfor[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mScience[0m[38;5;14m[1m [0m[38;5;14m[1mand[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mEngineering[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/course/advanced-apache-spark-data-science-data-uc-berkeleyx-cs115x),[39m[38;5;12m [39m[38;5;14m[1mAdvanced[0m[38;5;14m[1m [0m
|
||
[38;5;14m[1mDistributed[0m[38;5;14m[1m [0m[38;5;14m[1mMachine[0m[38;5;14m[1m [0m[38;5;14m[1mLearning[0m[38;5;14m[1m [0m[38;5;14m[1mwith[0m[38;5;14m[1m [0m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;12m [39m[38;5;12m(https://www.edx.org/course/advanced-distributed-machine-learning-uc-berkeleyx-cs125x))[39m[38;5;12m [39m[38;5;12mcovering[39m[38;5;12m [39m[38;5;12mdifferent[39m[38;5;12m [39m[38;5;12maspects[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12msoftware[39m[38;5;12m [39m[38;5;12mengineering[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mscience.[39m[38;5;12m [39m[38;5;12mPython[39m[38;5;12m [39m[38;5;12moriented.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBig Data Analysis with Scala and Spark (Coursera)[0m[38;5;12m (https://www.coursera.org/learn/big-data-analysys) - Scala oriented introductory course. Part of [39m[38;5;14m[1mFunctional Programming in Scala Specialization[0m[38;5;12m (https://www.coursera.org/specializations/scala).[39m
|
||
|
||
[38;2;255;187;0m[4mWorkshops[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAMP[0m[38;5;14m[1m [0m[38;5;14m[1mCamp[0m[38;5;12m [39m[38;5;12m(http://ampcamp.berkeley.edu)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mPeriodical[39m[38;5;12m [39m[38;5;12mtraining[39m[38;5;12m [39m[38;5;12mevent[39m[38;5;12m [39m[38;5;12morganized[39m[38;5;12m [39m[38;5;12mby[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mUC[0m[38;5;14m[1m [0m[38;5;14m[1mBerkeley[0m[38;5;14m[1m [0m[38;5;14m[1mAMPLab[0m[38;5;12m [39m[38;5;12m(https://amplab.cs.berkeley.edu/).[39m[38;5;12m [39m[38;5;12mA[39m[38;5;12m [39m[38;5;12msource[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12museful[39m[38;5;12m [39m[38;5;12mexercise[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mrecorded[39m[38;5;12m [39m[38;5;12mworkshops[39m[38;5;12m [39m[38;5;12mcovering[39m[38;5;12m [39m[38;5;12mdifferent[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1mBerkeley[0m[38;5;14m[1m [0m[38;5;14m[1mData[0m[38;5;14m[1m [0m
|
||
[38;5;14m[1mAnalytics[0m[38;5;14m[1m [0m[38;5;14m[1mStack[0m[38;5;12m [39m[38;5;12m(https://amplab.cs.berkeley.edu/software/).[39m
|
||
|
||
[38;2;255;187;0m[4mProjects Using Spark[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOryx 2[0m[38;5;12m (https://github.com/OryxProject/oryx) - [39m[38;5;14m[1mLambda architecture[0m[38;5;12m (http://lambda-architecture.net/) platform built on Apache Spark and [39m[38;5;14m[1mApache Kafka[0m[38;5;12m (http://kafka.apache.org/) with specialization for real-time large scale machine learning.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPhoton ML[0m[38;5;12m (https://github.com/linkedin/photon-ml) - A machine learning library supporting classical Generalized Mixed Model and Generalized Additive Mixed Effect Model.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPredictionIO[0m[38;5;12m (https://prediction.io/) - Machine Learning server for developers and data scientists to build and deploy predictive applications in a fraction of the time.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCrossdata[0m[38;5;12m (https://github.com/Stratio/Crossdata) - Data integration platform with extended DataSource API and multi-user environment.[39m
|
||
|
||
|
||
[38;2;255;187;0m[4mDocker Images[0m
|
||
|
||
[38;5;12m- [39m[38;5;14m[1mapache/spark[0m[38;5;12m (https://hub.docker.com/r/apache/spark) - Apache Spark Official Docker images.[39m
|
||
[38;5;12m- [39m[38;5;14m[1mjupyter/docker-stacks/pyspark-notebook[0m[38;5;12m (https://github.com/jupyter/docker-stacks/tree/master/pyspark-notebook) - PySpark with Jupyter Notebook and Mesos client.[39m
|
||
[38;5;12m- [39m[38;5;14m[1msequenceiq/docker-spark[0m[38;5;12m (https://github.com/sequenceiq/docker-spark) - Yarn images from [39m[38;5;14m[1mSequenceIQ[0m[38;5;12m (http://www.sequenceiq.com/).[39m
|
||
[38;5;12m- [39m[38;5;14m[1mdatamechanics/spark[0m[38;5;12m (https://hub.docker.com/r/datamechanics/spark) - An easy-to-set-up Docker image for Apache Spark from [39m[38;5;14m[1mData Mechanics[0m[38;5;12m (https://www.datamechanics.co/).[39m
|
||
|
||
[38;2;255;187;0m[4mMiscellaneous[0m
|
||
|
||
[38;5;12m- [39m[38;5;14m[1mSpark with Scala Gitter channel[0m[38;5;12m (https://gitter.im/spark-scala/Lobby) - "_A place to discuss and ask questions about using Scala for Spark programming_" started by [39m[38;5;14m[1m@deanwampler[0m[38;5;12m (https://github.com/deanwampler).[39m
|
||
[38;5;12m-[39m[38;5;12m [39m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1mUser[0m[38;5;14m[1m [0m[38;5;14m[1mList[0m[38;5;12m [39m[38;5;12m(http://apache-spark-user-list.1001560.n3.nabble.com/)[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;14m[1mApache[0m[38;5;14m[1m [0m[38;5;14m[1mSpark[0m[38;5;14m[1m [0m[38;5;14m[1mDevelopers[0m[38;5;14m[1m [0m[38;5;14m[1mList[0m[38;5;12m [39m[38;5;12m(http://apache-spark-developers-list.1001551.n3.nabble.com/)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mMailing[39m[38;5;12m [39m[38;5;12mlists[39m[38;5;12m [39m[38;5;12mdedicated[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12musage[39m[38;5;12m [39m[38;5;12mquestions[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mdevelopment[39m[38;5;12m [39m[38;5;12mtopics[39m[38;5;12m [39m
|
||
[38;5;12mrespectively.[39m
|
||
|
||
[38;2;255;187;0m[4mReferences[0m
|
||
|
||
|
||
|
||
[38;2;255;187;0m[4mLicense[0m
|
||
|
||
|
||
|
||
[38;5;12m<img src="https://mirrors.creativecommons.org/presskit/buttons/88x31/svg/publicdomain.svg"[39m
|
||
[48;5;235m[38;5;249m style="border-style: none;" alt="Public Domain Mark" />[49m[39m
|
||
|
||
|
||
[38;5;12mThis work (Awesome Spark, by https://github.com/awesome-spark/awesome-spark) is free of known copyright restrictions.[39m
|
||
|
||
|
||
[38;5;12mApache Spark, Spark, Apache, and the Spark logo are trademarks of The Apache Software Foundation[39m
|
||
[38;5;12m . This compilation is not endorsed by The Apache Software Foundation.[39m
|
||
|
||
[38;5;12mInspired by [39m[38;5;14m[1msindresorhus/awesome[0m[38;5;12m (https://github.com/sindresorhus/awesome).[39m
|
||
|
||
[38;5;12mspark Github: https://github.com/awesome-spark/awesome-spark[39m
|