288 lines
68 KiB
Plaintext
288 lines
68 KiB
Plaintext
|
||
[38;5;12m [39m[38;2;255;187;0m[1m[4mAwesome Web Archiving [0m[38;5;14m[1m[4m![0m[38;2;255;187;0m[1m[4mAwesome[0m[38;5;14m[1m[4m (https://awesome.re/badge.svg)[0m[38;2;255;187;0m[1m[4m (https://awesome.re)[0m
|
||
|
||
[38;5;12mWeb[39m[38;5;12m [39m[38;5;12marchiving[39m[38;5;12m [39m[38;5;12mis[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mprocess[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mcollecting[39m[38;5;12m [39m[38;5;12mportions[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mWorld[39m[38;5;12m [39m[38;5;12mWide[39m[38;5;12m [39m[38;5;12mWeb[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mensure[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12minformation[39m[38;5;12m [39m[38;5;12mis[39m[38;5;12m [39m[38;5;12mpreserved[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12marchive[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mfuture[39m[38;5;12m [39m[38;5;12mresearchers,[39m[38;5;12m [39m[38;5;12mhistorians,[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mpublic.[39m[38;5;12m [39m[38;5;12mWeb[39m[38;5;12m [39m[38;5;12marchivists[39m[38;5;12m [39m[38;5;12mtypically[39m[38;5;12m [39m[38;5;12memploy[39m[38;5;12m [39m[38;5;12mWeb[39m[38;5;12m [39m[38;5;12mcrawlers[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mautomated[39m[38;5;12m [39m[38;5;12mcapture[39m[38;5;12m [39m
|
||
[38;5;12mdue[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mmassive[39m[38;5;12m [39m[38;5;12mscale[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mWeb.[39m[38;5;12m [39m[38;5;12mEver-evolving[39m[38;5;12m [39m[38;5;12mWeb[39m[38;5;12m [39m[38;5;12mstandards[39m[38;5;12m [39m[38;5;12mrequire[39m[38;5;12m [39m[38;5;12mcontinuous[39m[38;5;12m [39m[38;5;12mevolution[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12marchiving[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mkeep[39m[38;5;12m [39m[38;5;12mup[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mchanges[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mWeb[39m[38;5;12m [39m[38;5;12mtechnologies[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mensure[39m[38;5;12m [39m[38;5;12mreliable[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mmeaningful[39m[38;5;12m [39m[38;5;12mcapture[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mreplay[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12marchived[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12mpages.[39m
|
||
|
||
|
||
[38;2;255;187;0m[4mContents[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mTraining/Documentation[0m[38;5;12m (#trainingdocumentation)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mResources for Web Publishers[0m[38;5;12m (#resources-for-web-publishers)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mTools & Software[0m[38;5;12m (#tools--software)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAcquisition[0m[38;5;12m (#acquisition)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mReplay[0m[38;5;12m (#replay)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSearch & Discovery[0m[38;5;12m (#search--discovery)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mUtilities[0m[38;5;12m (#utilities)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWARC I/O Libraries[0m[38;5;12m (#warc-io-libraries)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAnalysis[0m[38;5;12m (#analysis)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mQuality Assurance[0m[38;5;12m (#quality-assurance)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCuration[0m[38;5;12m (#curation)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommunity Resources[0m[38;5;12m (#community-resources)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOther Awesome Lists[0m[38;5;12m (#other-awesome-lists)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBlogs and Scholarship[0m[38;5;12m (#blogs-and-scholarship)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMailing Lists[0m[38;5;12m (#mailing-lists)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSlack[0m[38;5;12m (#slack)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mTwitter[0m[38;5;12m (#twitter)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWeb Archiving Service Providers[0m[38;5;12m (#web-archiving-service-providers)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSelf-hostable, Open Source[0m[38;5;12m (#self-hostable-open-source)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHosted, Closed Source[0m[38;5;12m (#hosted-closed-source)[39m
|
||
|
||
[38;2;255;187;0m[4mTraining/Documentation[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mIntroductions to web archiving concepts:[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWhat is a web archive?[0m[38;5;12m (https://youtu.be/ubDHY-ynWi0) - A video from [39m[38;5;14m[1mthe UK Web Archive YouTube Channel[0m[38;5;12m (https://www.youtube.com/channel/UCJukhTSw8VRj-VNTpBcqWkw)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWikipedia's List of Web Archiving Initiatives[0m[38;5;12m (https://en.wikipedia.org/wiki/List_of_Web_archiving_initiatives)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mGlossary of Archive-It and Web Archiving Terms[0m[38;5;12m (https://support.archive-it.org/hc/en-us/articles/208111686-Glossary-of-Archive-It-and-Web-Archiving-Terms)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mThe[0m[38;5;14m[1m [0m[38;5;14m[1mWeb[0m[38;5;14m[1m [0m[38;5;14m[1mArchiving[0m[38;5;14m[1m [0m[38;5;14m[1mLifecycle[0m[38;5;14m[1m [0m[38;5;14m[1mModel[0m[38;5;12m [39m[38;5;12m(https://archive-it.org/blog/post/announcing-the-web-archiving-life-cycle-model/)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mThe[39m[38;5;12m [39m[38;5;12mWeb[39m[38;5;12m [39m[38;5;12mArchiving[39m[38;5;12m [39m[38;5;12mLifecycle[39m[38;5;12m [39m[38;5;12mModel[39m[38;5;12m [39m[38;5;12mis[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12mattempt[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mincorporate[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mtechnological[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mprogrammatic[39m[38;5;12m [39m[38;5;12marms[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12marchiving[39m[38;5;12m [39m
|
||
[38;5;12minto[39m[38;5;12m [39m[38;5;12ma[39m[38;5;12m [39m[38;5;12mframework[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12mwill[39m[38;5;12m [39m[38;5;12mbe[39m[38;5;12m [39m[38;5;12mrelevant[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12many[39m[38;5;12m [39m[38;5;12morganization[39m[38;5;12m [39m[38;5;12mseeking[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12marchive[39m[38;5;12m [39m[38;5;12mcontent[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mweb.[39m[38;5;12m [39m[38;5;12mArchive-It,[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12marchiving[39m[38;5;12m [39m[38;5;12mservice[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mInternet[39m[38;5;12m [39m[38;5;12mArchive,[39m[38;5;12m [39m[38;5;12mdeveloped[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mmodel[39m[38;5;12m [39m[38;5;12mbased[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mits[39m[38;5;12m [39m[38;5;12mwork[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mmemory[39m[38;5;12m [39m[38;5;12minstitutions[39m[38;5;12m [39m[38;5;12maround[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mworld.[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mRetrieving and Archiving Information from Websites by Wael Eskandar and Brad Murray[0m[38;5;12m (https://kit.exposingtheinvisible.org/en/web-archive.html/)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mTraining materials:[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mIIPC and DPC Training materials: module for beginners (8 sessions)[0m[38;5;12m (https://netpreserve.org/web-archiving/training-materials/)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mUNT Web Archiving Course[0m[38;5;12m (https://github.com/vphill/web-archiving-course)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mContinuing Education to Advance Web Archiving (CEDWARC)[0m[38;5;12m (https://cedwarc.github.io/)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mA Whirlwind Tour of Common Crawl's Datasets using Python[0m[38;5;12m (https://github.com/commoncrawl/whirlwind-python/)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mThe WARC Standard:[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mThe [39m[38;5;14m[1mwarc-specifications[0m[38;5;12m (https://iipc.github.io/warc-specifications/) community HTML version of the official specification and hub for new proposals.[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mThe [39m[38;5;14m[1mofficial ISO 28500 WARC specification homepage[0m[38;5;12m (http://bibnum.bnf.fr/WARC/).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mFor researchers using web archives:[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mGLAM Workbench: Web Archives[0m[38;5;12m (https://glam-workbench.github.io/web-archives/) - See also [39m[38;5;14m[1mthis related blog post on 'Asking questions with web archives'[0m[38;5;12m (https://netpreserveblog.wordpress.com/2020/05/28/asking-questions-with-web-archives/).[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchives Unleashed Toolkit documentation[0m[38;5;12m (https://aut.docs.archivesunleashed.org/)[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mTutorial for Humanities researchers about how to explore Arquivo.pt[0m[38;5;12m (https://sobre.arquivo.pt/en/tutorial-for-humanities-researchers-about-how-to-use-arquivo-pt/)[39m
|
||
|
||
[38;2;255;187;0m[4mResources for Web Publishers[0m
|
||
|
||
[38;5;12mThese resources can help when working with individuals or organisations who publish on the web, and who want to make sure their site can be archived.[39m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mDefinition[0m[38;5;14m[1m [0m[38;5;14m[1mof[0m[38;5;14m[1m [0m[38;5;14m[1mWeb[0m[38;5;14m[1m [0m[38;5;14m[1mArchivability[0m[38;5;12m [39m[38;5;12m(https://nullhandle.org/web-archivability/index.html)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mThis[39m[38;5;12m [39m[38;5;12mdescribes[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mease[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mwhich[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12mcontent[39m[38;5;12m [39m[38;5;12mcan[39m[38;5;12m [39m[38;5;12mbe[39m[38;5;12m [39m[38;5;12mpreserved.[39m[38;5;12m [39m[38;5;12m([39m[38;5;14m[1mArchived[0m[38;5;14m[1m [0m[38;5;14m[1mversion[0m[38;5;14m[1m [0m[38;5;14m[1mfrom[0m[38;5;14m[1m [0m[38;5;14m[1mthe[0m[38;5;14m[1m [0m[38;5;14m[1mStanford[0m[38;5;14m[1m [0m[38;5;14m[1mLibraries[0m[38;5;12m [39m
|
||
[38;5;12m(https://web.archive.org/web/20230728211501/https://library.stanford.edu/projects/web-archiving/archivability))[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mThe [39m[38;5;14m[1mArchive Ready[0m[38;5;12m (http://archiveready.com/) tool, for estimating how likely a web page will be archived successfully.[39m
|
||
|
||
|
||
[38;2;255;187;0m[4mTools & Software[0m
|
||
|
||
[38;5;12mThis[39m[38;5;12m [39m[38;5;12mlist[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12msoftware[39m[38;5;12m [39m[38;5;12mis[39m[38;5;12m [39m[38;5;12mintended[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mbriefly[39m[38;5;12m [39m[38;5;12mdescribe[39m[38;5;12m [39m[38;5;12msome[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mmost[39m[38;5;12m [39m[38;5;12mimportant[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mwidely-used[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mrelated[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12marchiving.[39m[38;5;12m [39m[38;5;12mFor[39m[38;5;12m [39m[38;5;12mmore[39m[38;5;12m [39m[38;5;12mdetails,[39m[38;5;12m [39m[38;5;12mwe[39m[38;5;12m [39m[38;5;12mrecommend[39m[38;5;12m [39m[38;5;12myou[39m[38;5;12m [39m[38;5;12mrefer[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12m(and[39m[38;5;12m [39m[38;5;12mcontribute[39m[38;5;12m [39m[38;5;12mto!)[39m[38;5;12m [39m[38;5;12mthese[39m[38;5;12m [39m[38;5;12mexcellent[39m[38;5;12m [39m[38;5;12mresources[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mother[39m[38;5;12m [39m
|
||
[38;5;12mgroups:[39m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mComparison of web archiving software[0m[38;5;12m (https://github.com/archivers-space/research/tree/master/web_archiving)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAwesome Website Change Monitoring[0m[38;5;12m (https://github.com/edgi-govdata-archiving/awesome-website-change-monitoring)[39m
|
||
|
||
[38;2;255;187;0m[4mAcquisition[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchiveBox[0m[38;5;12m (https://github.com/pirate/ArchiveBox) - A tool which maintains an additive archive from RSS feeds, bookmarks, and links using wget, Chrome headless, and other methods (formerly [39m[48;5;235m[38;5;249mBookmark Archiver[49m[39m[38;5;12m). [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1marchivenow[0m[38;5;12m (https://github.com/oduwsdl/archivenow) - A [39m[38;5;14m[1mPython library[0m[38;5;12m (http://ws-dl.blogspot.com/2017/02/2017-02-22-archive-now-archivenow.html) to push web resources into on-demand web archives. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchiveWeb.Page[0m[38;5;12m [39m[38;5;12m(https://webrecorder.net/archivewebpage/)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mA[39m[38;5;12m [39m[38;5;12mplugin[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mChrome[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mother[39m[38;5;12m [39m[38;5;12mChromium[39m[38;5;12m [39m[38;5;12mbased[39m[38;5;12m [39m[38;5;12mbrowsers[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12mlets[39m[38;5;12m [39m[38;5;12myou[39m[38;5;12m [39m[38;5;12minteractively[39m[38;5;12m [39m[38;5;12marchive[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12mpages,[39m[38;5;12m [39m[38;5;12mreplay[39m[38;5;12m [39m[38;5;12mthem,[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mexport[39m[38;5;12m [39m[38;5;12mthem[39m[38;5;12m [39m[38;5;12mas[39m[38;5;12m [39m[38;5;12mWARC[39m[38;5;12m [39m[38;5;12m&[39m[38;5;12m [39m[38;5;12mWACZ[39m[38;5;12m [39m[38;5;12mfiles.[39m[38;5;12m [39m[38;5;12mAlso[39m[38;5;12m [39m[38;5;12mavailable[39m[38;5;12m [39m[38;5;12mas[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12mElectron[39m[38;5;12m [39m
|
||
[38;5;12mbased[39m[38;5;12m [39m[38;5;12mdesktop[39m[38;5;12m [39m[38;5;12mapplication.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAuto[0m[38;5;14m[1m [0m[38;5;14m[1mArchiver[0m[38;5;12m [39m[38;5;12m(https://github.com/bellingcat/auto-archiver)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mPython[39m[38;5;12m [39m[38;5;12mscript[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mautomatically[39m[38;5;12m [39m[38;5;12marchive[39m[38;5;12m [39m[38;5;12msocial[39m[38;5;12m [39m[38;5;12mmedia[39m[38;5;12m [39m[38;5;12mposts,[39m[38;5;12m [39m[38;5;12mvideos,[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mimages[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12ma[39m[38;5;12m [39m[38;5;12mGoogle[39m[38;5;12m [39m[38;5;12mSheets[39m[38;5;12m [39m[38;5;12mdocument.[39m[38;5;12m [39m[38;5;12mRead[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;14m[1marticle[0m[38;5;14m[1m [0m[38;5;14m[1mabout[0m[38;5;14m[1m [0m[38;5;14m[1mAuto[0m[38;5;14m[1m [0m[38;5;14m[1mArchiver[0m[38;5;14m[1m [0m[38;5;14m[1mon[0m[38;5;14m[1m [0m[38;5;14m[1mbellingcat.com[0m[38;5;12m [39m
|
||
[38;5;12m(https://www.bellingcat.com/resources/2022/09/22/preserve-vital-online-content-with-bellingcats-auto-archiver-tool/).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBrowsertrix Crawler[0m[38;5;12m (https://github.com/webrecorder/browsertrix-crawler) - A Chromium based high-fidelity crawling system, designed to run a complex, customizable browser-based crawl in a single Docker container. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mBrozzler[0m[38;5;12m (https://github.com/internetarchive/brozzler) - A distributed web crawler (爬虫) that uses a real browser (Chrome or Chromium) to fetch pages and embedded urls and to extract links. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCairn[0m[38;5;12m (https://github.com/wabarc/cairn) - An npm package and CLI tool for saving webpages. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mChronicler[0m[38;5;12m (https://github.com/CGamesPlay/chronicler) - Web browser with record and replay functionality. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommunity Archive[0m[38;5;12m (https://www.community-archive.org/) - Open Twitter Database and API with tools and resources for building on archived Twitter data.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mcrau[0m[38;5;12m (https://github.com/turicas/crau) - crau is the way (most) Brazilians pronounce crawl, it's the easiest command-line tool for archiving the Web and playing archives: you just need a list of URLs. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCrawl[0m[38;5;12m (https://git.autistici.org/ale/crawl) - A simple web crawler in Golang. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mcrocoite[0m[38;5;12m (https://github.com/promyloph/crocoite) - Crawl websites using headless Google Chrome/Chromium and save resources, static DOM snapshot and page screenshots to WARC files. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mDiskerNet[0m[38;5;12m (https://github.com/dosyago/DiskerNet) - A non-WARC-based tool which hooks into the Chrome browser and archives everything you browse making it available for offline replay. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mF(b)arc[0m[38;5;12m (https://github.com/justinlittman/fbarc) - A commandline tool and Python library for archiving data from [39m[38;5;14m[1mFacebook[0m[38;5;12m (https://www.facebook.com/) using the [39m[38;5;14m[1mGraph API[0m[38;5;12m (https://developers.facebook.com/docs/graph-api). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mfreeze-dry[0m[38;5;12m (https://github.com/WebMemex/freeze-dry) - JavaScript library to turn a page into a static, self-contained HTML document; useful for browser extensions. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mgrab-site[0m[38;5;12m (https://github.com/ArchiveTeam/grab-site) - The archivist's web crawler: WARC output, dashboard for all crawls, dynamic ignore patterns. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHeritrix[0m[38;5;12m (https://github.com/internetarchive/heritrix3/wiki) - An open source, extensible, web-scale, archival quality web crawler. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHeritrix Q&A[0m[38;5;12m (https://github.com/internetarchive/heritrix3/discussions/categories/q-a) - A discussion forum for asking questions and getting answers about using Heritrix.[39m
|
||
[38;5;12m [39m[38;5;12m [39m[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHeritrix Walkthrough[0m[38;5;12m (https://github.com/web-archive-group/heritrix-walkthrough) [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mhtml2warc[0m[38;5;12m (https://github.com/steffenfritz/html2warc) - A simple script to convert offline data into a single WARC file. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHTTrack[0m[38;5;12m (http://www.httrack.com/) - An open source website copying utility. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mmonolith[0m[38;5;12m (https://github.com/Y2Z/monolith) - CLI tool to save a web page as a single HTML file. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mObelisk[0m[38;5;12m (https://github.com/go-shiori/obelisk) - Go package and CLI tool for saving a web page as a single HTML file. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mScoop[0m[38;5;12m (https://github.com/harvard-lil/scoop) - High-fidelity, browser-based, single-page web archiving library and CLI for witnessing the web. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSingleFile[0m[38;5;12m (https://github.com/gildas-lormeau/SingleFile) - Browser extension for Firefox/Chrome and CLI tool to save a faithful copy of a complete page as a single HTML file. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSiteStory[0m[38;5;12m (http://mementoweb.github.io/SiteStory/) - A transactional archive that selectively captures and stores transactions that take place between a web client (browser) and a web server. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSocial Feed Manager[0m[38;5;12m (https://gwu-libraries.github.io/sfm-ui/) - Open source software that enables users to create social media collections from Twitter, Tumblr, Flickr, and Sina Weibo public APIs. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSquidwarc[0m[38;5;12m [39m[38;5;12m(https://github.com/N0taN3rd/Squidwarc)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mAn[39m[38;5;12m [39m[38;5;14m[1mopen[0m[38;5;14m[1m [0m[38;5;14m[1msource,[0m[38;5;14m[1m [0m[38;5;14m[1mhigh-fidelity,[0m[38;5;14m[1m [0m[38;5;14m[1mpage[0m[38;5;14m[1m [0m[38;5;14m[1minteracting[0m[38;5;12m [39m[38;5;12m(http://ws-dl.blogspot.com/2017/07/2017-07-24-replacing-heritrix-with.html)[39m[38;5;12m [39m[38;5;12marchival[39m[38;5;12m [39m[38;5;12mcrawler[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12muses[39m[38;5;12m [39m[38;5;12mChrome[39m[38;5;12m [39m[38;5;12mor[39m[38;5;12m [39m[38;5;12mChrome[39m[38;5;12m [39m[38;5;12mHeadless[39m[38;5;12m [39m[38;5;12mdirectly.[39m[38;5;12m [39m[48;2;30;30;40m[38;5;13m[3m(In[0m[48;2;30;30;40m[38;5;13m[3m [0m
|
||
[48;2;30;30;40m[38;5;13m[3mDevelopment)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mStormCrawler[0m[38;5;12m (http://stormcrawler.net/) - A collection of resources for building low-latency, scalable web crawlers on Apache Storm. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mtwarc[0m[38;5;12m (https://github.com/docnow/twarc) - A command line tool and Python library for archiving Twitter JSON data. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWAIL[0m[38;5;12m [39m[38;5;12m(https://github.com/machawk1/wail)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mA[39m[38;5;12m [39m[38;5;12mgraphical[39m[38;5;12m [39m[38;5;12muser[39m[38;5;12m [39m[38;5;12minterface[39m[38;5;12m [39m[38;5;12m(GUI)[39m[38;5;12m [39m[38;5;12matop[39m[38;5;12m [39m[38;5;12mmultiple[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12marchiving[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mintended[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mbe[39m[38;5;12m [39m[38;5;12mused[39m[38;5;12m [39m[38;5;12mas[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m[38;5;12measy[39m[38;5;12m [39m[38;5;12mway[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12manyone[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mpreserve[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mreplay[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12mpages;[39m[38;5;12m [39m[38;5;14m[1mPython[0m[38;5;12m [39m[38;5;12m(https://machawk1.github.io/wail/),[39m[38;5;12m [39m[38;5;14m[1mElectron[0m[38;5;12m [39m
|
||
[38;5;12m(https://github.com/n0tan3rd/wail).[39m[38;5;12m [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWarcprox[0m[38;5;12m (https://github.com/internetarchive/warcprox) - WARC-writing MITM HTTP/S proxy. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWARCreate[0m[38;5;12m (http://matkelly.com/warcreate/) - A [39m[38;5;14m[1mGoogle Chrome[0m[38;5;12m (https://www.google.com/intl/en/chrome/browser/) extension for archiving an individual webpage or website to a WARC file. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWarcworker[0m[38;5;12m (https://github.com/peterk/warcworker) - An open source, dockerized, queued, high fidelity web archiver based on Squidwarc with a simple web GUI. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWayback[0m[38;5;12m (https://github.com/wabarc/wayback) - A toolkit for snapshotting webpages to the Internet Archive, archive.today, IPFS and beyond. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWaybackpy[0m[38;5;12m (https://github.com/akamhy/waybackpy) - Wayback Machine Save, CDX and availability API interface in Python and a command-line tool. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWeb2Warc[0m[38;5;12m (https://github.com/helgeho/Web2Warc) - An easy-to-use and highly customizable crawler that enables anyone to create their own little Web archives (WARC/CDX). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWeb Curator Tool[0m[38;5;12m (https://webcuratortool.org) - Open-source workflow management for selective web archiving. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWebMemex[0m[38;5;12m (https://github.com/WebMemex) - Browser extension for Firefox and Chrome which lets you archive web pages you visit. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWget[0m[38;5;12m (http://www.gnu.org/software/wget/) - An open source file retrieval utility that as of [39m[38;5;14m[1mversion 1.14 supports writing WARCs[0m[38;5;12m (http://www.archiveteam.org/index.php?title=Wget_with_WARC_output). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWget-lua[0m[38;5;12m (https://github.com/alard/wget-lua) - Wget with Lua extension. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWpull[0m[38;5;12m (https://github.com/chfoo/wpull) - A Wget-compatible (or remake/clone/replacement/alternative) web downloader and crawler. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
|
||
[38;2;255;187;0m[4mReplay[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mInterPlanetary Wayback (ipwb)[0m[38;5;12m (https://github.com/oduwsdl/ipwb) - Web Archive (WARC) indexing and replay using [39m[38;5;14m[1mIPFS[0m[38;5;12m (https://ipfs.io/).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOpenWayback[0m[38;5;12m (https://github.com/iipc/openwayback/) - The open source project aimed to develop Wayback Machine, the key software used by web archives worldwide to play back archived websites in the user's browser. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPYWB[0m[38;5;12m (https://github.com/webrecorder/pywb) - A Python 3 implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mReconstructive[0m[38;5;12m (https://oduwsdl.github.io/Reconstructive/) - Reconstructive is a ServiceWorker module for client-side reconstruction of composite mementos by rerouting resource requests to corresponding archived copies (JavaScript).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mReplayWeb.page[0m[38;5;12m (https://webrecorder.net/replaywebpage/) - A browser-based, fully client-side replay engine for both local and remote WARC & WACZ files. Also available as an Electron based desktop application. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwarc2html[0m[38;5;12m (https://github.com/iipc/warc2html) - Converts WARC files to static HTML suitable for browsing offline or rehosting.[39m
|
||
|
||
[38;2;255;187;0m[4mSearch & Discovery[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mhyphe[0m[38;5;12m (https://github.com/medialab/hyphe) - A webcrawler built for research uses with a graphical user interface in order to build web corpuses made of lists of web actors and maps of links between them. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMink[0m[38;5;12m (https://github.com/machawk1/mink) - A [39m[38;5;14m[1mGoogle Chrome[0m[38;5;12m (https://www.google.com/intl/en/chrome/) extension for querying Memento aggregators while browsing and integrating live-archived web navigation. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPANDORÆ[0m[38;5;12m (https://github.com/Guillaume-Levrier/PANDORAE) - A desktop research software to be plugged on a Solr endpoint to query, retrieve, normalize and visually explore web archives. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mplayback[0m[38;5;12m [39m[38;5;12m(https://github.com/wabarc/playback)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mA[39m[38;5;12m [39m[38;5;12mtoolkit[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12msearching[39m[38;5;12m [39m[38;5;12marchived[39m[38;5;12m [39m[38;5;12mwebpages[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;14m[1mInternet[0m[38;5;14m[1m [0m[38;5;14m[1mArchive[0m[38;5;12m [39m[38;5;12m(https://web.archive.org),[39m[38;5;12m [39m[38;5;14m[1marchive.today[0m[38;5;12m [39m[38;5;12m(https://archive.today),[39m[38;5;12m [39m[38;5;14m[1mMemento[0m[38;5;12m [39m[38;5;12m(http://timetravel.mementoweb.org)[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mbeyond.[39m[38;5;12m [39m[48;2;30;30;40m[38;5;13m[3m(In[0m[48;2;30;30;40m[38;5;13m[3m [0m
|
||
[48;2;30;30;40m[38;5;13m[3mDevelopment)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSecurityTrails[0m[38;5;12m (https://securitytrails.com/) - Web based archive for WHOIS and DNS records. REST API available free of charge.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mTempas v1[0m[38;5;12m (http://tempas.L3S.de/v1) - Temporal web archive search based on [39m[38;5;14m[1mDelicious[0m[38;5;12m (https://en.wikipedia.org/wiki/Delicious_(website)) tags. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mTempas[0m[38;5;14m[1m [0m[38;5;14m[1mv2[0m[38;5;12m [39m[38;5;12m(http://tempas.L3S.de/v2)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mTemporal[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12marchive[39m[38;5;12m [39m[38;5;12msearch[39m[38;5;12m [39m[38;5;12mbased[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mlinks[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12manchor[39m[38;5;12m [39m[38;5;12mtexts[39m[38;5;12m [39m[38;5;12mextracted[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mGerman[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12m1996[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12m2013[39m[38;5;12m [39m[38;5;12m(results[39m[38;5;12m [39m[38;5;12mare[39m[38;5;12m [39m[38;5;12mnot[39m[38;5;12m [39m[38;5;12mlimited[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mGerman[39m[38;5;12m [39m[38;5;12mpages,[39m[38;5;12m [39m[38;5;12me.g.,[39m[38;5;12m [39m[38;5;14m[1mObama@2005-2009[0m[38;5;14m[1m [0m[38;5;14m[1min[0m[38;5;14m[1m [0m[38;5;14m[1mTempas[0m[38;5;12m [39m
|
||
[38;5;12m(http://tempas.l3s.de/v2/query?q=obama&from=2005&to=2009)).[39m[38;5;12m [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwebarchive-discovery[0m[38;5;12m (https://github.com/ukwa/webarchive-discovery) - WARC and ARC full-text indexing and discovery tools, with a number of associated tools capable of using the index shown below. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mShine[0m[38;5;12m (https://github.com/ukwa/shine) - A prototype web archives exploration UI, developed with researchers as part of the [39m[38;5;14m[1mBig UK Domain Data for the Arts and Humanities project[0m[38;5;12m (https://buddah.projects.history.ac.uk/). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSolrWayback[0m[38;5;12m [39m[38;5;12m(https://github.com/netarchivesuite/solrwayback)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mA[39m[38;5;12m [39m[38;5;12mbackend[39m[38;5;12m [39m[38;5;12mJava[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mfrontend[39m[38;5;12m [39m[38;5;12mVUE[39m[38;5;12m [39m[38;5;12mJS[39m[38;5;12m [39m[38;5;12mproject[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mfreetext[39m[38;5;12m [39m[38;5;12msearch[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12ma[39m[38;5;12m [39m[38;5;12mbuilt-in[39m[38;5;12m [39m[38;5;12mplayback[39m[38;5;12m [39m[38;5;12mengine.[39m[38;5;12m [39m[38;5;12mRequires[39m[38;5;12m [39m[38;5;12mWarc[39m[38;5;12m [39m[38;5;12mfiles[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12mhave[39m[38;5;12m [39m[38;5;12mbeen[39m[38;5;12m [39m[38;5;12mindexed[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mWarc-Indexer.[39m[38;5;12m [39m[38;5;12mThe[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12mapplication[39m[38;5;12m [39m[38;5;12malso[39m[38;5;12m [39m[38;5;12mhas[39m
|
||
[38;5;12ma[39m[38;5;12m [39m[38;5;12mwide[39m[38;5;12m [39m[38;5;12mrange[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mvisualization[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mdata[39m[38;5;12m [39m[38;5;12mexport[39m[38;5;12m [39m[38;5;12mtools[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12mcan[39m[38;5;12m [39m[38;5;12mbe[39m[38;5;12m [39m[38;5;12mused[39m[38;5;12m [39m[38;5;12mon[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mwhole[39m[38;5;12m [39m[38;5;12mwebarchive.[39m[38;5;12m [39m[38;5;14m[1mSolrWayback[0m[38;5;14m[1m [0m[38;5;14m[1m4[0m[38;5;14m[1m [0m[38;5;14m[1mBundle[0m[38;5;14m[1m [0m[38;5;14m[1mrelease[0m[38;5;12m [39m[38;5;12m(https://github.com/netarchivesuite/solrwayback/releases)[39m[38;5;12m [39m[38;5;12mcontains[39m[38;5;12m [39m[38;5;12mall[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12msoftware[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mdependencies[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12man[39m[38;5;12m [39m
|
||
[38;5;12mout-of-the-box[39m[38;5;12m [39m[38;5;12msolution[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12mis[39m[38;5;12m [39m[38;5;12measy[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12minstall.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWarclight[0m[38;5;12m (https://github.com/archivesunleashed/warclight) - A Project Blacklight based Rails engine that supports the discovery of web archives held in the WARC and ARC formats. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWasp[0m[38;5;12m (https://github.com/webis-de/wasp) - A fully functional prototype of a personal [39m[38;5;14m[1mweb archive and search system[0m[38;5;12m (http://ceur-ws.org/Vol-2167/paper6.pdf). [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;12mOther possible options for building a front-end are listed in the [39m[48;5;235m[38;5;249mwebarchive-discovery[49m[39m[38;5;12m wiki, [39m[38;5;14m[1mhere[0m[38;5;12m (https://github.com/ukwa/webarchive-discovery/wiki/Front-ends).[39m
|
||
|
||
[38;2;255;187;0m[4mUtilities[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchiveTools[0m[38;5;12m (https://github.com/recrm/ArchiveTools) - Collection of tools to extract and interact with WARC files (Python).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mcdx-toolkit[0m[38;5;12m (https://pypi.org/project/cdx-toolkit/) - Library and CLI to consult cdx indexes and create WARC extractions of subsets. Abstracts away Common Crawl's unusual crawl structure. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mGo Get Crawl[0m[38;5;12m (https://github.com/karust/gogetcrawl) - Extract web archive data using [39m[38;5;14m[1mWayback Machine[0m[38;5;12m (https://web.archive.org/) and [39m[38;5;14m[1mCommon Crawl[0m[38;5;12m (https://commoncrawl.org/). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mgowarcserver[0m[38;5;12m (https://github.com/nlnwa/gowarcserver) - [39m[38;5;14m[1mBadgerDB[0m[38;5;12m (https://github.com/dgraph-io/badger)-based capture index (CDX) and WARC record server, used to index and serve WARC files (Go).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mhar2warc[0m[38;5;12m (https://github.com/webrecorder/har2warc) - Convert HTTP Archive (HAR) -> Web Archive (WARC) format (Python).[39m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mhttpreserve.info[0m[38;5;12m [39m[38;5;12m(https://httpreserve.info)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mService[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mreturn[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mstatus[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12ma[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12mpage[39m[38;5;12m [39m[38;5;12mor[39m[38;5;12m [39m[38;5;12msave[39m[38;5;12m [39m[38;5;12mit[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mInternet[39m[38;5;12m [39m[38;5;12mArchive.[39m[38;5;12m [39m[38;5;12mHTTPreserve[39m[38;5;12m [39m[38;5;12mincludes[39m[38;5;12m [39m[38;5;12mdisambiguation[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mwell-known[39m[38;5;12m [39m[38;5;12mshort[39m[38;5;12m [39m[38;5;12mlink[39m[38;5;12m [39m[38;5;12mservices.[39m[38;5;12m [39m[38;5;12mIt[39m[38;5;12m [39m[38;5;12mreturns[39m[38;5;12m [39m[38;5;12mJSON[39m[38;5;12m [39m[38;5;12mvia[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mbrowser[39m[38;5;12m [39m[38;5;12mor[39m[38;5;12m [39m[38;5;12mcommand[39m[38;5;12m [39m[38;5;12mline[39m[38;5;12m [39m
|
||
[38;5;12mvia[39m[38;5;12m [39m[38;5;12mCURL[39m[38;5;12m [39m[38;5;12musing[39m[38;5;12m [39m[38;5;12mGET.[39m[38;5;12m [39m[38;5;12mDescribes[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12msites[39m[38;5;12m [39m[38;5;12musing[39m[38;5;12m [39m[38;5;12mearliest[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mlatest[39m[38;5;12m [39m[38;5;12mdates[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mInternet[39m[38;5;12m [39m[38;5;12mArchive[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mdemonstrates[39m[38;5;12m [39m[38;5;12mthe[39m[38;5;12m [39m[38;5;12mconstruction[39m[38;5;12m [39m[38;5;12mof[39m[38;5;12m [39m[38;5;12mRobust[39m[38;5;12m [39m[38;5;12mLinks[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12mits[39m[38;5;12m [39m[38;5;12moutput[39m[38;5;12m [39m[38;5;12musing[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12mrange.[39m[38;5;12m [39m[38;5;12m(Golang).[39m[38;5;12m [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHTTPreserve linkstat[0m[38;5;12m (https://github.com/httpreserve/linkstat) - Command line implementation of [39m[38;5;14m[1marchive.org[0m[38;5;12m (https://archive.org/). (Golang). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mInternet Archive Library[0m[38;5;12m (https://github.com/jjjake/internetarchive) - A command line tool and Python library for interacting directly with [39m[38;5;14m[1marchive.org[0m[38;5;12m (https://archive.org). (Python). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mhttrack2warc[0m[38;5;12m (https://github.com/nla/httrack2warc) - Convert HTTrack archives to WARC format (Java).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMementoMap[0m[38;5;12m (https://github.com/oduwsdl/MementoMap) - A Tool to Summarize Web Archive Holdings (Python). [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mMemGator[0m[38;5;12m (https://github.com/oduwsdl/MemGator) - A Memento Aggregator CLI and Server (Golang). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mnode-cdxj[0m[38;5;12m (https://github.com/N0taN3rd/node-cdxj) - [39m[38;5;14m[1mCDXJ[0m[38;5;12m (https://github.com/oduwsdl/ORS/wiki/CDXJ) file parser (Node.js). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOutbackCDX[0m[38;5;12m [39m[38;5;12m(https://github.com/nla/outbackcdx)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mRocksDB-based[39m[38;5;12m [39m[38;5;12mcapture[39m[38;5;12m [39m[38;5;12mindex[39m[38;5;12m [39m[38;5;12m(CDX)[39m[38;5;12m [39m[38;5;12mserver[39m[38;5;12m [39m[38;5;12msupporting[39m[38;5;12m [39m[38;5;12mincremental[39m[38;5;12m [39m[38;5;12mupdates[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mcompression.[39m[38;5;12m [39m[38;5;12mCan[39m[38;5;12m [39m[38;5;12mbe[39m[38;5;12m [39m[38;5;12mused[39m[38;5;12m [39m[38;5;12mas[39m[38;5;12m [39m[38;5;12mbackend[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mOpenWayback,[39m[38;5;12m [39m[38;5;12mPyWb[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;14m[1mHeritrix[0m[38;5;12m [39m
|
||
[38;5;12m(https://github.com/ukwa/ukwa-heritrix/blob/master/src/main/java/uk/bl/wap/modules/uriuniqfilters/OutbackCDXRecentlySeenUriUniqFilter.java).[39m[38;5;12m [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mpy-wasapi-client[0m[38;5;12m (https://github.com/unt-libraries/py-wasapi-client) - Command line application to download crawls from WASAPI (Python). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mThe Unarchiver[0m[38;5;12m (https://theunarchiver.com/) - Program to extract the contents of many archive formats, inclusive of WARC, to a file system. Free variant of The Archive Browser (macOS only, Proprietary app).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mtikalinkextract[0m[38;5;12m (https://github.com/httpreserve/tikalinkextract) - Extract hyperlinks as a seed for web archiving from folders of document types that can be parsed by Apache Tika (Golang, Apache Tika Server). [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwasapi-downloader[0m[38;5;12m (https://github.com/sul-dlss/wasapi-downloader) - Java command line application to download crawls from WASAPI. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWarchaeology[0m[38;5;12m (https://nlnwa.github.io/warchaeology/) - Warchaeology is a collection of tools for inspecting, manipulating, deduplicating and validating WARC-files. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwarcdb[0m[38;5;12m (https://github.com/florents-Tselai/warcdb) - A command line utility (Python) for importing WARC files into a SQLite database. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwarcdedupe[0m[38;5;12m (https://gitlab.com/taricorp/warcdedupe) - WARC deduplication tool (and WARC library) written in Rust. (In Development)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwarc-safe[0m[38;5;12m (https://github.com/natliblux/warc-safe) - Automatic detection of viruses and NSFW content in WARC files.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWarcPartitioner[0m[38;5;12m (https://github.com/helgeho/WarcPartitioner) - Partition (W)ARC Files by MIME Type and Year. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwarcrefs[0m[38;5;12m (https://github.com/arcalex/warcrefs) - Web archive deduplication tools. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwebarchive-indexing[0m[38;5;12m (https://github.com/ikreymer/webarchive-indexing) - Tools for bulk indexing of WARC/ARC files on Hadoop, EMR or local file system.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwikiteam[0m[38;5;12m (https://github.com/WikiTeam/wikiteam) - Tools for downloading and preserving wikis. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
|
||
[38;2;255;187;0m[4mWARC I/O Libraries[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mFastWARC[0m[38;5;12m (https://github.com/chatnoir-eu/chatnoir-resiliparse) - A high-performance WARC parsing library (Python).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mHadoopConcatGz[0m[38;5;12m (https://github.com/helgeho/HadoopConcatGz) - A Splitable Hadoop InputFormat for Concatenated GZIP Files (and [39m[48;5;235m[38;5;249m*.warc.gz[49m[39m[38;5;12m). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mjwarc[0m[38;5;12m (https://github.com/iipc/jwarc) - Read and write WARC files with a type safe API (Java).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mJwat[0m[38;5;12m (https://github.com/netarchivesuite/jwat) - Libraries for reading/writing/validating WARC/ARC/GZIP files (Java). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mJwat-Tools[0m[38;5;12m (https://github.com/netarchivesuite/jwat-tools) - Tools for reading/writing/validating WARC/ARC/GZIP files (Java). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mnode-warc[0m[38;5;12m (https://github.com/N0taN3rd/node-warc) - Parse WARC files or create WARC files using either [39m[38;5;14m[1mElectron[0m[38;5;12m (https://electron.atom.io/) or [39m[38;5;14m[1mchrome-remote-interface[0m[38;5;12m (https://github.com/cyrus-and/chrome-remote-interface) (Node.js). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mSparkling[0m[38;5;12m (https://github.com/internetarchive/Sparkling) - Internet Archive's Sparkling Data Processing Library. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mUnwarcit[0m[38;5;12m (https://github.com/emmadickson/unwarcit) - Command line interface to unzip WARC and WACZ files (Python).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWarcat[0m[38;5;12m (https://github.com/chfoo/warcat) - Tool and library for handling Web ARChive (WARC) files (Python). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWarcat-rs[0m[38;5;12m (https://github.com/chfoo/warcat-rs) - Command-line tool and Rust library for handling Web ARChive (WARC) files. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m[38;5;12m [39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwarcio[0m[38;5;12m (https://github.com/webrecorder/warcio) - Streaming WARC/ARC library for fast web archive IO (Python). [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwarctools[0m[38;5;12m (https://github.com/internetarchive/warctools) - Library to work with ARC and WARC files (Python).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mwebarchive[0m[38;5;12m (https://github.com/richardlehane/webarchive) - Golang readers for ARC and WARC webarchive formats (Golang).[39m
|
||
|
||
[38;2;255;187;0m[4mAnalysis[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchives Research Compute Hub[0m[38;5;12m (https://github.com/internetarchive/arch) - Web application for distributed compute analysis of Archive-It web archive collections. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchiveSpark[0m[38;5;12m (https://github.com/helgeho/ArchiveSpark) - An Apache Spark framework (not only) for Web Archives that enables easy data processing, extraction as well as derivation. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchives Unleashed Notebooks[0m[38;5;12m (https://github.com/archivesunleashed/notebooks) - Notebooks for working with web archives with the Archives Unleashed Toolkit, and derivatives generated by the Archives Unleashed Toolkit. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchives Unleashed Toolkit[0m[38;5;12m (https://github.com/archivesunleashed/aut) - Archives Unleashed Toolkit (AUT) is an open-source platform for analyzing web archives with Apache Spark. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommon Crawl Columnar Index[0m[38;5;12m (https://commoncrawl.org/tag/columnar-index/) - SQL-queryable index, with CDX info plus language classification. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommon Crawl Web Graph[0m[38;5;12m (https://commoncrawl.org/category/web-graph/) - A host or domain-level graph of the web, with ranking information. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommon Crawl Jupyter notebooks[0m[38;5;12m (https://github.com/commoncrawl/cc-notebooks) - A collection of notebooks using Common Crawl's various datasets. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mTweet Archives Unleashed Toolkit[0m[38;5;12m (https://github.com/archivesunleashed/twut) - An open-source toolkit for analyzing line-oriented JSON Twitter archives with Apache Spark. [39m[48;2;30;30;40m[38;5;13m[3m(In Development)[0m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWeb Data Commons[0m[38;5;12m (http://webdatacommons.org/) - Structured data extracted from Common Crawl. [39m[48;2;30;30;40m[38;5;13m[3m(Stable)[0m
|
||
|
||
[38;2;255;187;0m[4mQuality Assurance[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mChrome Check My Links[0m[38;5;12m (https://chrome.google.com/webstore/detail/check-my-links/ojkcdipcgfaekbeaelaapakgnjflfglf) - Browser extension: a link checker with more options.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mChrome link checker[0m[38;5;12m (https://chrome.google.com/webstore/detail/link-checker/aibjbgmpmnidnmagaefhmcjhadpffaoi) - Browser extension: basic link checker.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mChrome link gopher[0m[38;5;12m (https://chrome.google.com/webstore/detail/bpjdkodgnbfalgghnbeggfbfjpcfamkf/publish-accepted?hl=en-US&gl=US) - Browser extension: link harvester on a page.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mChrome Open Multiple URLs[0m[38;5;12m (https://chrome.google.com/webstore/detail/open-multiple-urls/oifijhaokejakekmnjmphonojcfkpbbh?hl=de) - Browser extension: opens multiple URLs and also extracts URLs from text.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mChrome Revolver[0m[38;5;12m (https://chrome.google.com/webstore/detail/revolver-tabs/dlknooajieciikpedpldejhhijacnbda) - Browser extension: switches between browser tabs.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mFlameShot[0m[38;5;12m (https://github.com/lupoDharkael/flameshot) - Screen capture and annotation on Ubuntu.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPlayOnLinux[0m[38;5;12m (https://www.playonlinux.com/en/) - For running Xenu and Notepad++ on Ubuntu.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mPlayOnMac[0m[38;5;12m (https://www.playonmac.com/en/) - For running Xenu and Notepad++ on macOS.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWindows[0m[38;5;14m[1m [0m[38;5;14m[1mSnipping[0m[38;5;14m[1m [0m[38;5;14m[1mTool[0m[38;5;12m [39m[38;5;12m(https://support.microsoft.com/en-gb/help/13776/windows-use-snipping-tool-to-capture-screenshots)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mWindows[39m[38;5;12m [39m[38;5;12mbuilt-in[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12mpartial[39m[38;5;12m [39m[38;5;12mscreen[39m[38;5;12m [39m[38;5;12mcapture[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mannotation.[39m[38;5;12m [39m[38;5;12mOn[39m[38;5;12m [39m[38;5;12mmacOS[39m[38;5;12m [39m[38;5;12myou[39m[38;5;12m [39m[38;5;12mcan[39m[38;5;12m [39m[38;5;12muse[39m[38;5;12m [39m[38;5;12mCommand[39m[38;5;12m [39m[38;5;12m+[39m[38;5;12m [39m[38;5;12mShift[39m[38;5;12m [39m[38;5;12m+[39m[38;5;12m [39m[38;5;12m4[39m[38;5;12m [39m[38;5;12m(keyboard[39m[38;5;12m [39m[38;5;12mshortcut[39m[38;5;12m [39m
|
||
[38;5;12mfor[39m[38;5;12m [39m[38;5;12mtaking[39m[38;5;12m [39m[38;5;12mpartial[39m[38;5;12m [39m[38;5;12mscreen[39m[38;5;12m [39m[38;5;12mcapture).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWineBottler[0m[38;5;12m (http://winebottler.kronenberg.org/) - For running Xenu and Notepad++ on macOS.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mxDoTool[0m[38;5;12m (https://github.com/jordansissel/xdotool) - Click automation on Ubuntu.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mXenu[0m[38;5;12m (http://home.snafu.de/tilman/xenulink.html) - Desktop link checker for Windows.[39m
|
||
|
||
[38;2;255;187;0m[4mCuration[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mZotero[0m[38;5;14m[1m [0m[38;5;14m[1mRobust[0m[38;5;14m[1m [0m[38;5;14m[1mLinks[0m[38;5;14m[1m [0m[38;5;14m[1mExtension[0m[38;5;12m [39m[38;5;12m(https://robustlinks.mementoweb.org/zotero/)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;12mA[39m[38;5;12m [39m[38;5;14m[1mZotero[0m[38;5;12m [39m[38;5;12m(https://www.zotero.org/)[39m[38;5;12m [39m[38;5;12mextension[39m[38;5;12m [39m[38;5;12mthat[39m[38;5;12m [39m[38;5;12msubmits[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;12mreads[39m[38;5;12m [39m[38;5;12mfrom[39m[38;5;12m [39m[38;5;12mweb[39m[38;5;12m [39m[38;5;12marchives.[39m[38;5;12m [39m[38;5;12mSource[39m[38;5;12m [39m[38;5;14m[1mon[0m[38;5;14m[1m [0m[38;5;14m[1mGitHub[0m[38;5;12m [39m[38;5;12m(https://github.com/lanl/Zotero-Robust-Links-Extension).[39m[38;5;12m [39m
|
||
[38;5;12mSupersedes[39m[38;5;12m [39m[38;5;14m[1mleonkt/zotero-memento[0m[38;5;12m [39m[38;5;12m(https://github.com/leonkt/zotero-memento).[39m
|
||
|
||
[38;2;255;187;0m[4mCommunity Resources[0m
|
||
|
||
[38;2;255;187;0m[4mOther Awesome Lists[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWeb Archiving Community[0m[38;5;12m (https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mAwesome Memento[0m[38;5;12m (https://github.com/machawk1/awesome-memento)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mThe WARC Ecosystem[0m[38;5;12m (http://www.archiveteam.org/index.php?title=The_WARC_Ecosystem)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mThe Web Crawl section of COPTR[0m[38;5;12m (http://coptr.digipres.org/Category:Web_Crawl)[39m
|
||
|
||
[38;2;255;187;0m[4mBlogs and Scholarship[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mIIPC Blog[0m[38;5;12m (https://netpreserveblog.wordpress.com/)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWeb Archiving Roundtable[0m[38;5;12m (https://webarchivingrt.wordpress.com/) - Unofficial blog of the Web Archiving Roundtable of the [39m[38;5;14m[1mSociety of American Archivists[0m[38;5;12m (https://www2.archivists.org/) maintained by the members of the Web Archiving Roundtable.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mThe Web as History[0m[38;5;12m (https://www.uclpress.co.uk/products/84010) - An open-source book that provides a conceptual overview of web archiving research, as well as several case studies.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWS-DL Blog[0m[38;5;12m (https://ws-dl.blogspot.com/) - Web Science and Digital Libraries Research Group blogs about various Web archiving related topics, scholarly work, and academic trip reports.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mDSHR's Blog[0m[38;5;12m (https://blog.dshr.org/) - David Rosenthal regularly reviews and summarizes work done in the Digital Preservation field.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mUK Web Archive Blog[0m[38;5;12m (https://blogs.bl.uk/webarchive/)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommon Crawl Foundation Blog[0m[38;5;12m (https://commoncrawl.org/blog) - [39m[38;5;14m[1mrss[0m[38;5;12m (http://commoncrawl.org/blog/rss.xml)[39m
|
||
|
||
[38;2;255;187;0m[4mMailing Lists[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommon Crawl[0m[38;5;12m (https://groups.google.com/g/common-crawl)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mIIPC[0m[38;5;12m (http://netpreserve.org/about-us/iipc-mailing-list/)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mOpenWayback[0m[38;5;12m (https://groups.google.com/g/openwayback-dev)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mWASAPI[0m[38;5;12m (https://groups.google.com/g/wasapi-community)[39m
|
||
|
||
[38;2;255;187;0m[4mSlack[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mIIPC Slack[0m[38;5;12m (https://iipc.slack.com/) - Ask [39m[38;5;14m[1m@netpreserve[0m[38;5;12m (https://twitter.com/NetPreserve?s=20) for access.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchives Unleashed Slack[0m[38;5;12m (https://archivesunleashed.slack.com/) - [39m[38;5;14m[1mFill out this request form[0m[38;5;12m (http://slack.archivesunleashed.org/) for access to a researcher group of people working with web archives.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchivers[0m[38;5;14m[1m [0m[38;5;14m[1mSlack[0m[38;5;12m [39m[38;5;12m(https://archivers.slack.com)[39m[38;5;12m [39m[38;5;12m-[39m[38;5;12m [39m[38;5;14m[1mInvite[0m[38;5;14m[1m [0m[38;5;14m[1myourself[0m[38;5;12m [39m[38;5;12m(https://archivers-slack.herokuapp.com/)[39m[38;5;12m [39m[38;5;12mto[39m[38;5;12m [39m[38;5;12ma[39m[38;5;12m [39m[38;5;12mmulti-disciplinary[39m[38;5;12m [39m[38;5;12meffort[39m[38;5;12m [39m[38;5;12mfor[39m[38;5;12m [39m[38;5;12marchiving[39m[38;5;12m [39m[38;5;12mprojects[39m[38;5;12m [39m[38;5;12mrun[39m[38;5;12m [39m[38;5;12min[39m[38;5;12m [39m[38;5;12maffiliation[39m[38;5;12m [39m[38;5;12mwith[39m[38;5;12m [39m[38;5;14m[1mEDGI[0m[38;5;12m [39m[38;5;12m(https://envirodatagov.org/archiving/)[39m[38;5;12m [39m[38;5;12mand[39m[38;5;12m [39m[38;5;14m[1mData[0m[38;5;14m[1m [0m[38;5;14m[1mTogether[0m
|
||
[38;5;12m(http://datatogether.org/).[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mCommon Crawl Foundation Partners[0m[38;5;12m (https://ccfpartners.slack.com/) (ask greg zat commoncrawl zot org for an invite)[39m
|
||
|
||
[38;2;255;187;0m[4mTwitter[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1m@NetPreserve[0m[38;5;12m (https://twitter.com/NetPreserve) - Official IIPC handle.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1m@WebSciDL[0m[38;5;12m (https://twitter.com/WebSciDL) - ODU Web Science and Digital Libraries Research Group.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1m#WebArchiving[0m[38;5;12m (https://twitter.com/search?q=%23webarchiving)[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1m#WebArchiveWednesday[0m[38;5;12m (https://twitter.com/hashtag/webarchivewednesday)[39m
|
||
|
||
[38;2;255;187;0m[4mWeb Archiving Service Providers[0m
|
||
|
||
[38;5;12mThe intention is that we only list services that allow web archives to be exported in standard formats (WARC or WACZ). But this is not an endorsement of these services, and readers should check and evaluate these options based on their needs. [39m
|
||
|
||
[38;2;255;187;0m[4mSelf-hostable, Open Source[0m
|
||
|
||
[48;2;30;30;40m[38;5;13m[3m [0m[48;2;30;30;40m[38;5;14m[1m[3mBrowsertrix[0m[48;2;30;30;40m[38;5;13m[3m (https://webrecorder.net/browsertrix/) - From [0m[48;2;30;30;40m[38;5;14m[1m[3mWebrecorder[0m[48;2;30;30;40m[38;5;13m[3m (https://webrecorder.net/), source available at https://github.com/webrecorder/browsertrix.[0m
|
||
[48;2;30;30;40m[38;5;13m[3m [0m[48;2;30;30;40m[38;5;14m[1m[3mConifer[0m[48;2;30;30;40m[38;5;13m[3m (https://conifer.rhizome.org/) - From [0m[48;2;30;30;40m[38;5;14m[1m[3mRhizome[0m[48;2;30;30;40m[38;5;13m[3m (https://rhizome.org/), source available at https://github.com/Rhizome-Conifer.[0m
|
||
|
||
[38;2;255;187;0m[4mHosted, Closed Source[0m
|
||
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArchive-It[0m[38;5;12m (https://archive-it.org/) - From the Internet Archive.[39m
|
||
[48;5;12m[38;5;11m⟡[49m[39m[38;5;12m [39m[38;5;14m[1mArkiwera[0m[38;5;12m (https://arkiwera.se/wp/websites/)[39m
|
||
[48;2;30;30;40m[38;5;13m[3m [0m[48;2;30;30;40m[38;5;14m[1m[3mHanzo[0m[48;2;30;30;40m[38;5;13m[3m (https://www.hanzo.co/chronicle)[0m
|
||
[48;2;30;30;40m[38;5;13m[3m [0m[48;2;30;30;40m[38;5;14m[1m[3mMirrorWeb[0m[48;2;30;30;40m[38;5;13m[3m (https://www.mirrorweb.com/solutions/capabilities/website-archiving)[0m
|
||
[48;2;30;30;40m[38;5;13m[3m [0m[48;2;30;30;40m[38;5;14m[1m[3mPageFreezer[0m[48;2;30;30;40m[38;5;13m[3m (https://www.pagefreezer.com/)[0m
|
||
[48;2;30;30;40m[38;5;13m[3m [0m[48;2;30;30;40m[38;5;14m[1m[3mSmarsh[0m[48;2;30;30;40m[38;5;13m[3m (https://www.smarsh.com/platform/compliance-management/web-archive)[0m
|
||
|
||
[38;5;12mwebarchiving Github: https://github.com/iipc/awesome-web-archiving[39m
|