diff --git a/.travis.yml b/.travis.yml index d4709b6..066e607 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,10 @@ -language: ruby -rvm: - - 2.2 -before_script: - - gem install awesome_bot -script: - - site404=www.datawrangling.com,getglue-data.s3.amazonaws.com,archive.org/details/2011-05-calufa-twitter-sql,www.stats4stem.org,lib.stat.cmu.edu,http://www.oecd.org/document/0,census.gov/acs/www/data_documentation/data_release_info/ - - whtlist=travis,crawdad.cs.dartmouth.edu,data.nasdaq.com,137.189.35.203/WebUI/CatDatabase/catData.html,numbrary.com,www.cmr.osu.edu,gutenberg.org,donnees.gouv.qc.ca,data.rio.rj.gov.br,ntrl.ntis.gov,openflights.org,www.data.gov.bc.ca,earthdata.nasa,pgp-hms,cru.uea.ac.uk,networkdata.ics,datos.argentina,data.gov.ie,isi.edu,data.go.id,wiki.dbpedia,www.laval.ca,www.wunderground.com,data.lexingtonky.gov,arcgis,bixi - - site503=datamob.org,research.microsoft.com - - awesome_bot README.rst --allow-dupe --allow-redirect --set-timeout 5 --allow-timeout --white-list $site404,$whtlist,$site503 +# language: ruby +# rvm: +# - 2.2 +# before_script: +# - gem install awesome_bot +# script: +# - site404=www.datawrangling.com,getglue-data.s3.amazonaws.com,archive.org/details/2011-05-calufa-twitter-sql,www.stats4stem.org,lib.stat.cmu.edu,http://www.oecd.org/document/0,census.gov/acs/www/data_documentation/data_release_info/ +# - whtlist=travis,crawdad.cs.dartmouth.edu,data.nasdaq.com,137.189.35.203/WebUI/CatDatabase/catData.html,numbrary.com,www.cmr.osu.edu,gutenberg.org,donnees.gouv.qc.ca,data.rio.rj.gov.br,ntrl.ntis.gov,openflights.org,www.data.gov.bc.ca,earthdata.nasa,pgp-hms,cru.uea.ac.uk,networkdata.ics,datos.argentina,data.gov.ie,isi.edu,data.go.id,wiki.dbpedia,www.laval.ca,www.wunderground.com,data.lexingtonky.gov,arcgis,bixi +# - site503=datamob.org,research.microsoft.com +# - awesome_bot README.rst --allow-dupe --allow-redirect --set-timeout 5 --allow-timeout --white-list $site404,$whtlist,$site503 diff --git a/Government.rst 
b/Government.rst index 26555da..db7f229 100644 --- a/Government.rst +++ b/Government.rst @@ -85,6 +85,7 @@ Government * `Texas Open Data `_ * `The World Bank `_ * `Toronto, ON, Canada `_ +* `Tunisia `_ * `U.K. Government Data `_ * `U.S. American Community Survey `_ * `U.S. CDC Public Health datasets `_ @@ -100,4 +101,4 @@ Government * `Uruguay `_ * `Vancouver, BC Open Data Catalog `_ * `Victoria, BC, Canada `_ -* `Vienna, Austria `_ \ No newline at end of file +* `Vienna, Austria `_ diff --git a/README.rst b/README.rst index 0ee6b95..80ff418 100755 --- a/README.rst +++ b/README.rst @@ -3,8 +3,6 @@ Awesome Public Datasets .. image:: https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg :alt: Awesome :target: https://github.com/sindresorhus/awesome -.. image:: https://travis-ci.org/caesar0301/awesome-public-datasets.svg - :target: https://travis-ci.org/caesar0301/awesome-public-datasets `This list of public data sources `_ are collected and tidied from blogs, answers, and user responses. 
@@ -29,7 +27,6 @@ Biology * `Broad Cancer Cell Line Encyclopedia (CCLE) `_ * `Broad Bioimage Benchmark Collection (BBBC) `_ * `Cell Image Library `_ -* `Collaborative Research in Computational Neuroscience (CRCNS) `_ * `Complete Genomics Public Data `_ * `EBI ArrayExpress `_ * `EBI Protein Data Bank in Europe `_ @@ -48,8 +45,7 @@ Biology * `MIT Cancer Genomics Data `_ * `NCBI Proteins `_ * `NCBI Taxonomy `_ -* `NeuroData `_ -* `NIH Microarray data `_ or `FTP `_ +* `NIH Microarray data `_ or `FTP `_ (see FTP link on `RAW `_) * `OpenSNP genotypes data `_ * `Pathguid - Protein-Protein Interactions Catalog `_ * `Protein Data Bank `_ @@ -62,7 +58,6 @@ Biology * `Stanford Microarray Data `_ * `Stowers Institute Original Data Repository `_ * `Systems Science of Biological Dynamics (SSBD) Database `_ -* `Temple University Hospital EEG Database `_ * `The Cancer Genome Atlas (TCGA), available via Broad GDAC `_ * `The Catalogue of Life `_ * `The Personal Genome Project `_ or `PGP `_ @@ -75,6 +70,7 @@ Climate/Weather --------------- * `Australian Weather `_ +* `Aviation Weather Center - Consistent, timely and accurate weather information for the world airspace system `_ * `Brazilian Weather - Historical data (In Portuguese) `_ * `Canadian Meteorological Centre `_ * `Climate Data from UEA (updated monthly) `_ @@ -126,8 +122,9 @@ Computer Networks * `CommonCrawl Web Data over 7 years `_ * `CRAWDAD Wireless datasets from Dartmouth Univ. 
`_ * `Criteo click-through data `_ +* `OONI: Open Observatory of Network Interference - Internet censorship data `_ * `Open Mobile Data by MobiPerf `_ -* `Rapid7 Sonar Internet Scans `_ +* `Rapid7 Sonar Internet Scans `_ * `UCSD Network Telescope, IPv4 /8 net `_ @@ -148,19 +145,35 @@ Data Challenges * `Kaggle Competition Data `_ * `KDD Cup by Tencent 2012 `_ * `Localytics Data Visualization Challenge `_ -* `Netflix Prize `_ +* `Netflix Prize `_ * `Space Apps Challenge `_ * `Telecom Italia Big Data Challenge `_ * `Yelp Dataset Challenge `_ * `Bruteforce Database `_ +* `TravisTorrent Dataset - MSR'2017 Mining Challenge `_ + + +Earth Science +------------- + +* `AQUASTAT - Global water resources and uses `_ +* `BODC - marine data of ~22K vars `_ +* `Earth Models `_ +* `EOSDIS - NASA's earth observing system data `_ +* `Integrated Marine Observing System (IMOS) - roughly 30TB of ocean measurements `_ or `on S3 `_ +* `Marinexplore - Open Oceanographic Data `_ +* `Smithsonian Institution Global Volcano and Eruption Database `_ +* `USGS Earthquake Archives `_ + Economics --------- -* `American Economic Ass (AEA) `_ +* `American Economic Association (AEA) `_ * `EconData from UMD `_ * `Economic Freedom of the World Data `_ * `Historical MacroEconomc Statistics `_ +* `International Economics Database `_ and `various data tools `_ * `International Trade Statistics `_ * `Internet Product Code Database `_ * `Joint External Debt Data Hub `_ @@ -188,13 +201,18 @@ Energy * `BLUEd `_ * `COMBED `_ * `Dataport `_ +* `DRED `_ * `ECO `_ * `EIA `_ +* `HES `_ - Household Electricity Study, UK * `HFED `_ * `iAWE `_ -* `Plaid `_ +* `PLAID `_ - the Plug Load Appliance Identification Dataset * `REDD `_ -* `UK-Dale `_ +* `Tracebase `_ +* `UK-DALE `_ - UK Domestic Appliance-Level Electricity +* `WHITED `_ + Finance @@ -209,23 +227,13 @@ Finance * `Quandl `_ * `St Louis Federal `_ * `Yahoo Finance `_ -* `NYSE Market Data `_ +* `NYSE Market Data `_ (see FTP link on `RAW `_) -Geology -------- 
+GIS +--- -* `Earth Models `_ -* `Smithsonian Institution Global Volcano and Eruption Database `_ -* `USGS Earthquake Archives `_ - - -GIS/Environment ---------------- - -* `BODC - marine data of ~22K vars `_ * `Cambridge, MA, US, GIS data on GitHub `_ -* `EOSDIS - NASA's earth observing system data `_ * `Factual Global Location Data `_ * `Geo Spatial Data from ASU `_ * `Geo Wiki Project - Citizen-driven Environmental Monitoring `_ @@ -233,11 +241,8 @@ GIS/Environment * `GeoNames Worldwide `_ * `Global Administrative Areas Database (GADM) `_ * `Homeland Infrastructure Foundation-Level Data `_ -* `Integrated Marine Observing System (IMOS) - roughly 30TB of ocean measurements `_ or `on S3 `_ -* `International Institute for Systems Analysis - GIS Datasets `_ * `Landsat 8 on AWS `_ * `List of all countries in all languages `_ -* `Marinexplore - Open Oceanographic Data `_ * `National Weather Service GIS Data Portal `_ * `Natural Earth - vectors and rasters of the world `_ * `OpenAddresses `_ @@ -251,6 +256,7 @@ GIS/Environment * `World boundaries from the U.S. 
Department of State `_ * `World countries in multiple formats `_ + Government ---------- @@ -269,7 +275,7 @@ Healthcare * `MeSH, the vocabulary thesaurus used for indexing articles for PubMed `_ * `Number of Ebola Cases and Deaths in Affected Countries (2014) `_ * `Open-ODS (structure of the UK NHS) `_ -* `OpenPaymentsData, Healthcare financial relationship data `_ +* `OpenPaymentsData, Healthcare financial relationship data `_ * `The Cancer Genome Atlas project (TCGA) `_ and `BigQuery table `_ * `World Health Organization Global Health Observatory `_ @@ -281,11 +287,13 @@ Image Processing * `2GB of Photos of Cats `_ or `Archive version `_ * `Affective Image Classification `_ * `Animals with attributes `_ +* `Chars74K dataset, Character Recognition in Natural Images (both English and Kannada are available) `_ * `Face Recognition Benchmark `_ * `ImageNet (in WordNet hierarchy) `_ * `Indoor Scene Recognition `_ * `International Affective Picture System, UFL `_ * `Massive Visual Memory Stimuli, MIT `_ +* `MNIST database of handwritten digits, near 1 million examples `_ * `Several Shape-from-Silhouette Datasets `_ * `Stanford Dogs Dataset `_ * `SUN database, MIT `_ @@ -308,6 +316,7 @@ Machine Learning * `Machine Learning Data Set Repository `_ * `Million Song Dataset `_ * `More Song Datasets `_ +* `New Yorker caption contest ratings `_ * `MovieLens Data Sets `_ * `RDataMining - "R and Data Mining" ebook data `_ * `Registered Meteorites on Earth `_ @@ -339,18 +348,43 @@ Natural Language * `Flickr Personal Taxonomies `_ * `Freebase.com of people, places, and things `_ * `Google Books Ngrams (2.2TB) `_ +* `Google MC-AFP, generated based on the publicly available Gigaword dataset using Paragraph Vectors `_ * `Google Web 5gram (1TB, 2006) `_ * `Gutenberg eBooks List `_ * `Hansards text chunks of Canadian Parliament `_ * `Machine Comprehension Test (MCTest) of text from Microsoft Research `_ * `Machine Translation of European languages `_ +* `Multi-Domain Sentiment Dataset 
(version 2.0) `_ +* `Microsoft MAchine Reading COmprehension Dataset (or MS MARCO) `_ * `Personae Corpus `_ * `SaudiNewsNet Collection of Saudi Newspaper Articles (Arabic, 30K articles) `_ * `SMS Spam Collection in English `_ * `USENET postings corpus of 2005~2011 `_ * `Wikidata - Wikipedia databases `_ * `Wikipedia Links data - 40 Million Entities in Context `_ +* `Universal Dependencies `_ * `WordNet databases and tools `_ +* `Open Multilingual Wordnet `_ +* `Automatic Keyphrase Extraction `_ + + +Neuroscience +------------- + +* `Allen Institute Datasets `_ +* `Brain Catalogue `_ +* `Brainomics `_ +* `CodeNeuro Datasets `_ +* `Collaborative Research in Computational Neuroscience (CRCNS) `_ +* `FCP-INDI `_ +* `Human Connectome Project `_ +* `NDAR `_ +* `NIMH Data Archive `_ +* `NeuroData `_ +* `OASIS `_ +* `OpenfMRI `_ +* `Neuroelectro `_ +* `Study Forrest `_ Physics @@ -383,6 +417,7 @@ Public Domains * `Infochimps `_ * `KDNuggets Data Collections `_ * `Microsoft Azure Data Market Free DataSets `_ +* `Microsoft Data Science for Research `_ * `Numbray `_ * `Open Library Data Dumps `_ * `Reddit Datasets `_ @@ -424,7 +459,6 @@ Social Networks * `Facebook Data Scrape (2005) `_ * `Facebook Social Networks from LAW (since 2007) `_ * `Foursquare from UMN/Sarwat (2013) `_ -* `GetGlue - users rating TV shows `_ * `GitHub Collaboration Archive `_ * `Google Scholar citation relations `_ * `High-Resolution Contact Networks from Wearable Sensors `_ @@ -465,6 +499,7 @@ Social Sciences * `International Studies Compendium Project `_ * `James McGuire Cross National Data `_ * `MacroData Guide by Norsk samfunnsvitenskapelig datatjeneste `_ +* `Minnesota Population Center `_ * `MIT Reality Mining Dataset `_ * `Open Crime and Policing Data in England, Wales and Northern Ireland `_ * `Paul Hensel General International Data Page `_ @@ -474,8 +509,9 @@ Social Sciences * `StackExchange Data Explorer `_ * `Terrorism Research and Analysis Consortium `_ * `Texas Inmates Executed Since 1984 
`_ -* `Titanic Survival Data Set `_ +* `Titanic Survival Data Set `_ or `on Kaggle `_ * `UCB's Archive of Social Science Data (D-Lab) `_ +* `Uppsala Conflict Data Program `_ * `UCLA Social Sciences Data Archive `_ * `UN Civil Society Database `_ * `Universities Worldwide `_ @@ -533,7 +569,7 @@ Transportation * `RITA Airline On-Time Performance data `_ * `RITA/BTS transport data collection (TranStat) `_ * `Toronto Bike Share Stations (XML file) `_ -* `Transport for London (TFL) `_ +* `Transport for London (TFL) `_ * `Travel Tracker Survey (TTS) for Chicago `_ * `U.S. Bureau of Transportation Statistics (BTS) `_ * `U.S. Domestic Flights 1990 to 2009 `_ @@ -551,4 +587,3 @@ Complementary Collections * Quora: `Where can I find large datasets open to the public? `_ * RS.io: `100+ Interesting Data Sets for Statistics `_ * StaTrek: `Leveraging open data to understand urban lives `_ -