From f779555aa2b240e5e000b3e51f47b2d6b064c147 Mon Sep 17 00:00:00 2001 From: CoprDistGit Date: Wed, 10 May 2023 04:03:29 +0000 Subject: automatic import of python-ml-datasets --- .gitignore | 1 + python-ml-datasets.spec | 553 ++++++++++++++++++++++++++++++++++++++++++++++++ sources | 1 + 3 files changed, 555 insertions(+) create mode 100644 python-ml-datasets.spec create mode 100644 sources diff --git a/.gitignore b/.gitignore index e69de29..ab887e7 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +/ml_datasets-0.2.0.tar.gz diff --git a/python-ml-datasets.spec b/python-ml-datasets.spec new file mode 100644 index 0000000..fbcbe84 --- /dev/null +++ b/python-ml-datasets.spec @@ -0,0 +1,553 @@ +%global _empty_manifest_terminate_build 0 +Name: python-ml-datasets +Version: 0.2.0 +Release: 1 +Summary: Machine Learning dataset loaders +License: MIT +URL: https://github.com/explosion/ml-datasets +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/3c/a8/149700bd6087fbffdbe85d32a7587f497cf45c432864d0000eef6bad1020/ml_datasets-0.2.0.tar.gz +BuildArch: noarch + +Requires: python3-numpy +Requires: python3-tqdm +Requires: python3-srsly +Requires: python3-catalogue + +%description + + +# Machine learning dataset loaders for testing and examples + +Loaders for various machine learning datasets for testing and example scripts. +Previously in `thinc.extra.datasets`. + +[![PyPi Version](https://img.shields.io/pypi/v/ml-datasets.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/ml-datasets) + +## Setup and installation + +The package can be installed via pip: + +```bash +pip install ml-datasets +``` + +## Loaders + +Loaders can be imported directly or used via their string name (which is useful if they're set via command line arguments). Some loaders may take arguments – see the source for details. 
+ +```python +# Import directly +from ml_datasets import imdb +train_data, dev_data = imdb() +``` + +```python +# Load via registry +from ml_datasets import loaders +imdb_loader = loaders.get("imdb") +train_data, dev_data = imdb_loader() +``` + +### Available loaders + +#### NLP datasets + +| ID / Function | Description | NLP task | From URL | +| -------------------- | -------------------------------------------- | ----------------------------------------- | :------: | +| `imdb` | IMDB sentiment dataset | Binary classification: sentiment analysis | ✓ | +| `dbpedia` | DBPedia ontology dataset | Multi-class single-label classification | ✓ | +| `cmu` | CMU movie genres dataset | Multi-class, multi-label classification | ✓ | +| `quora_questions` | Duplicate Quora questions dataset | Detecting duplicate questions | ✓ | +| `reuters` | Reuters dataset (texts not included) | Multi-class multi-label classification | ✓ | +| `snli` | Stanford Natural Language Inference corpus | Recognizing textual entailment | ✓ | +| `stack_exchange` | Stack Exchange dataset | Question Answering | | +| `ud_ancora_pos_tags` | Universal Dependencies Spanish AnCora corpus | POS tagging | ✓ | +| `ud_ewtb_pos_tags` | Universal Dependencies English EWT corpus | POS tagging | ✓ | +| `wikiner` | WikiNER data | Named entity recognition | | + +#### Other ML datasets + +| ID / Function | Description | ML task | From URL | +| ------------- | ----------- | ----------------- | :------: | +| `mnist` | MNIST data | Image recognition | ✓ | + +### Dataset details + +#### IMDB + +Each instance contains the text of a movie review, and a sentiment expressed as `0` or `1`. + +```python +train_data, dev_data = ml_datasets.imdb() +for text, annot in train_data[0:5]: + print(f"Review: {text}") + print(f"Sentiment: {annot}") +``` + +- Download URL: [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/) +- Citation: [Andrew L. 
Maas et al., 2011](https://www.aclweb.org/anthology/P11-1015/) + +| Property | Training | Dev | +| ------------------- | ---------------- | ---------------- | +| # Instances | 25000 | 25000 | +| Label values | {`0`, `1`} | {`0`, `1`} | +| Labels per instance | Single | Single | +| Label distribution | Balanced (50/50) | Balanced (50/50) | + +#### DBPedia + +Each instance contains an ontological description, and a classification into one of the 14 distinct labels. + +```python +train_data, dev_data = ml_datasets.dbpedia() +for text, annot in train_data[0:5]: + print(f"Text: {text}") + print(f"Category: {annot}") +``` + +- Download URL: [Via fast.ai](https://course.fast.ai/datasets) +- Original citation: [Xiang Zhang et al., 2015](https://arxiv.org/abs/1509.01626) + +| Property | Training | Dev | +| ------------------- | -------- | -------- | +| # Instances | 560000 | 70000 | +| Label values | `1`-`14` | `1`-`14` | +| Labels per instance | Single | Single | +| Label distribution | Balanced | Balanced | + +#### CMU + +Each instance contains a movie description, and a classification into a list of appropriate genres. 
+ +```python +train_data, dev_data = ml_datasets.cmu() +for text, annot in train_data[0:5]: + print(f"Text: {text}") + print(f"Genres: {annot}") +``` + +- Download URL: [http://www.cs.cmu.edu/~ark/personas/](http://www.cs.cmu.edu/~ark/personas/) +- Original citation: [David Bamman et al., 2013](https://www.aclweb.org/anthology/P13-1035/) + +| Property | Training | Dev | +| ------------------- | --------------------------------------------------------------------------------------------- | --- | +| # Instances | 41793 | 0 | +| Label values | 363 different genres | - | +| Labels per instance | Multiple | - | +| Label distribution | Imbalanced: 147 labels with less than 20 examples, while `Drama` occurs more than 19000 times | - | + +#### Quora + +```python +train_data, dev_data = ml_datasets.quora_questions() +for questions, annot in train_data[0:50]: + q1, q2 = questions + print(f"Question 1: {q1}") + print(f"Question 2: {q2}") + print(f"Similarity: {annot}") +``` + +Each instance contains two quora questions, and a label indicating whether or not they are duplicates (`0`: no, `1`: yes). +The ground-truth labels contain some amount of noise: they are not guaranteed to be perfect. + +- Download URL: [http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv](http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv) +- Original citation: [Kornél Csernai et al., 2017](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) + +| Property | Training | Dev | +| ------------------- | ------------------------- | ------------------------- | +| # Instances | 363859 | 40429 | +| Label values | {`0`, `1`} | {`0`, `1`} | +| Labels per instance | Single | Single | +| Label distribution | Imbalanced: 63% label `0` | Imbalanced: 63% label `0` | + +### Registering loaders + +Loaders can be registered externally using the `loaders` registry as a decorator. 
For example: + +```python +@ml_datasets.loaders("my_custom_loader") +def my_custom_loader(): + return load_some_data() + +assert "my_custom_loader" in ml_datasets.loaders +``` + + + + +%package -n python3-ml-datasets +Summary: Machine Learning dataset loaders +Provides: python-ml-datasets +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-ml-datasets + + +# Machine learning dataset loaders for testing and examples + +Loaders for various machine learning datasets for testing and example scripts. +Previously in `thinc.extra.datasets`. + +[![PyPi Version](https://img.shields.io/pypi/v/ml-datasets.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/ml-datasets) + +## Setup and installation + +The package can be installed via pip: + +```bash +pip install ml-datasets +``` + +## Loaders + +Loaders can be imported directly or used via their string name (which is useful if they're set via command line arguments). Some loaders may take arguments – see the source for details. 
+ +```python +# Import directly +from ml_datasets import imdb +train_data, dev_data = imdb() +``` + +```python +# Load via registry +from ml_datasets import loaders +imdb_loader = loaders.get("imdb") +train_data, dev_data = imdb_loader() +``` + +### Available loaders + +#### NLP datasets + +| ID / Function | Description | NLP task | From URL | +| -------------------- | -------------------------------------------- | ----------------------------------------- | :------: | +| `imdb` | IMDB sentiment dataset | Binary classification: sentiment analysis | ✓ | +| `dbpedia` | DBPedia ontology dataset | Multi-class single-label classification | ✓ | +| `cmu` | CMU movie genres dataset | Multi-class, multi-label classification | ✓ | +| `quora_questions` | Duplicate Quora questions dataset | Detecting duplicate questions | ✓ | +| `reuters` | Reuters dataset (texts not included) | Multi-class multi-label classification | ✓ | +| `snli` | Stanford Natural Language Inference corpus | Recognizing textual entailment | ✓ | +| `stack_exchange` | Stack Exchange dataset | Question Answering | | +| `ud_ancora_pos_tags` | Universal Dependencies Spanish AnCora corpus | POS tagging | ✓ | +| `ud_ewtb_pos_tags` | Universal Dependencies English EWT corpus | POS tagging | ✓ | +| `wikiner` | WikiNER data | Named entity recognition | | + +#### Other ML datasets + +| ID / Function | Description | ML task | From URL | +| ------------- | ----------- | ----------------- | :------: | +| `mnist` | MNIST data | Image recognition | ✓ | + +### Dataset details + +#### IMDB + +Each instance contains the text of a movie review, and a sentiment expressed as `0` or `1`. + +```python +train_data, dev_data = ml_datasets.imdb() +for text, annot in train_data[0:5]: + print(f"Review: {text}") + print(f"Sentiment: {annot}") +``` + +- Download URL: [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/) +- Citation: [Andrew L. 
Maas et al., 2011](https://www.aclweb.org/anthology/P11-1015/) + +| Property | Training | Dev | +| ------------------- | ---------------- | ---------------- | +| # Instances | 25000 | 25000 | +| Label values | {`0`, `1`} | {`0`, `1`} | +| Labels per instance | Single | Single | +| Label distribution | Balanced (50/50) | Balanced (50/50) | + +#### DBPedia + +Each instance contains an ontological description, and a classification into one of the 14 distinct labels. + +```python +train_data, dev_data = ml_datasets.dbpedia() +for text, annot in train_data[0:5]: + print(f"Text: {text}") + print(f"Category: {annot}") +``` + +- Download URL: [Via fast.ai](https://course.fast.ai/datasets) +- Original citation: [Xiang Zhang et al., 2015](https://arxiv.org/abs/1509.01626) + +| Property | Training | Dev | +| ------------------- | -------- | -------- | +| # Instances | 560000 | 70000 | +| Label values | `1`-`14` | `1`-`14` | +| Labels per instance | Single | Single | +| Label distribution | Balanced | Balanced | + +#### CMU + +Each instance contains a movie description, and a classification into a list of appropriate genres. 
+ +```python +train_data, dev_data = ml_datasets.cmu() +for text, annot in train_data[0:5]: + print(f"Text: {text}") + print(f"Genres: {annot}") +``` + +- Download URL: [http://www.cs.cmu.edu/~ark/personas/](http://www.cs.cmu.edu/~ark/personas/) +- Original citation: [David Bamman et al., 2013](https://www.aclweb.org/anthology/P13-1035/) + +| Property | Training | Dev | +| ------------------- | --------------------------------------------------------------------------------------------- | --- | +| # Instances | 41793 | 0 | +| Label values | 363 different genres | - | +| Labels per instance | Multiple | - | +| Label distribution | Imbalanced: 147 labels with less than 20 examples, while `Drama` occurs more than 19000 times | - | + +#### Quora + +```python +train_data, dev_data = ml_datasets.quora_questions() +for questions, annot in train_data[0:50]: + q1, q2 = questions + print(f"Question 1: {q1}") + print(f"Question 2: {q2}") + print(f"Similarity: {annot}") +``` + +Each instance contains two quora questions, and a label indicating whether or not they are duplicates (`0`: no, `1`: yes). +The ground-truth labels contain some amount of noise: they are not guaranteed to be perfect. + +- Download URL: [http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv](http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv) +- Original citation: [Kornél Csernai et al., 2017](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) + +| Property | Training | Dev | +| ------------------- | ------------------------- | ------------------------- | +| # Instances | 363859 | 40429 | +| Label values | {`0`, `1`} | {`0`, `1`} | +| Labels per instance | Single | Single | +| Label distribution | Imbalanced: 63% label `0` | Imbalanced: 63% label `0` | + +### Registering loaders + +Loaders can be registered externally using the `loaders` registry as a decorator. 
For example: + +```python +@ml_datasets.loaders("my_custom_loader") +def my_custom_loader(): + return load_some_data() + +assert "my_custom_loader" in ml_datasets.loaders +``` + + + + +%package help +Summary: Development documents and examples for ml-datasets +Provides: python3-ml-datasets-doc +%description help + + +# Machine learning dataset loaders for testing and examples + +Loaders for various machine learning datasets for testing and example scripts. +Previously in `thinc.extra.datasets`. + +[![PyPi Version](https://img.shields.io/pypi/v/ml-datasets.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/ml-datasets) + +## Setup and installation + +The package can be installed via pip: + +```bash +pip install ml-datasets +``` + +## Loaders + +Loaders can be imported directly or used via their string name (which is useful if they're set via command line arguments). Some loaders may take arguments – see the source for details. + +```python +# Import directly +from ml_datasets import imdb +train_data, dev_data = imdb() +``` + +```python +# Load via registry +from ml_datasets import loaders +imdb_loader = loaders.get("imdb") +train_data, dev_data = imdb_loader() +``` + +### Available loaders + +#### NLP datasets + +| ID / Function | Description | NLP task | From URL | +| -------------------- | -------------------------------------------- | ----------------------------------------- | :------: | +| `imdb` | IMDB sentiment dataset | Binary classification: sentiment analysis | ✓ | +| `dbpedia` | DBPedia ontology dataset | Multi-class single-label classification | ✓ | +| `cmu` | CMU movie genres dataset | Multi-class, multi-label classification | ✓ | +| `quora_questions` | Duplicate Quora questions dataset | Detecting duplicate questions | ✓ | +| `reuters` | Reuters dataset (texts not included) | Multi-class multi-label classification | ✓ | +| `snli` | Stanford Natural Language Inference corpus | Recognizing textual entailment | ✓ | +| 
`stack_exchange` | Stack Exchange dataset | Question Answering | | +| `ud_ancora_pos_tags` | Universal Dependencies Spanish AnCora corpus | POS tagging | ✓ | +| `ud_ewtb_pos_tags` | Universal Dependencies English EWT corpus | POS tagging | ✓ | +| `wikiner` | WikiNER data | Named entity recognition | | + +#### Other ML datasets + +| ID / Function | Description | ML task | From URL | +| ------------- | ----------- | ----------------- | :------: | +| `mnist` | MNIST data | Image recognition | ✓ | + +### Dataset details + +#### IMDB + +Each instance contains the text of a movie review, and a sentiment expressed as `0` or `1`. + +```python +train_data, dev_data = ml_datasets.imdb() +for text, annot in train_data[0:5]: + print(f"Review: {text}") + print(f"Sentiment: {annot}") +``` + +- Download URL: [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/) +- Citation: [Andrew L. Maas et al., 2011](https://www.aclweb.org/anthology/P11-1015/) + +| Property | Training | Dev | +| ------------------- | ---------------- | ---------------- | +| # Instances | 25000 | 25000 | +| Label values | {`0`, `1`} | {`0`, `1`} | +| Labels per instance | Single | Single | +| Label distribution | Balanced (50/50) | Balanced (50/50) | + +#### DBPedia + +Each instance contains an ontological description, and a classification into one of the 14 distinct labels. 
+ +```python +train_data, dev_data = ml_datasets.dbpedia() +for text, annot in train_data[0:5]: + print(f"Text: {text}") + print(f"Category: {annot}") +``` + +- Download URL: [Via fast.ai](https://course.fast.ai/datasets) +- Original citation: [Xiang Zhang et al., 2015](https://arxiv.org/abs/1509.01626) + +| Property | Training | Dev | +| ------------------- | -------- | -------- | +| # Instances | 560000 | 70000 | +| Label values | `1`-`14` | `1`-`14` | +| Labels per instance | Single | Single | +| Label distribution | Balanced | Balanced | + +#### CMU + +Each instance contains a movie description, and a classification into a list of appropriate genres. + +```python +train_data, dev_data = ml_datasets.cmu() +for text, annot in train_data[0:5]: + print(f"Text: {text}") + print(f"Genres: {annot}") +``` + +- Download URL: [http://www.cs.cmu.edu/~ark/personas/](http://www.cs.cmu.edu/~ark/personas/) +- Original citation: [David Bamman et al., 2013](https://www.aclweb.org/anthology/P13-1035/) + +| Property | Training | Dev | +| ------------------- | --------------------------------------------------------------------------------------------- | --- | +| # Instances | 41793 | 0 | +| Label values | 363 different genres | - | +| Labels per instance | Multiple | - | +| Label distribution | Imbalanced: 147 labels with less than 20 examples, while `Drama` occurs more than 19000 times | - | + +#### Quora + +```python +train_data, dev_data = ml_datasets.quora_questions() +for questions, annot in train_data[0:50]: + q1, q2 = questions + print(f"Question 1: {q1}") + print(f"Question 2: {q2}") + print(f"Similarity: {annot}") +``` + +Each instance contains two quora questions, and a label indicating whether or not they are duplicates (`0`: no, `1`: yes). +The ground-truth labels contain some amount of noise: they are not guaranteed to be perfect. 
+
+- Download URL: [http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv](http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv)
+- Original citation: [Kornél Csernai et al., 2017](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
+
+| Property | Training | Dev |
+| ------------------- | ------------------------- | ------------------------- |
+| # Instances | 363859 | 40429 |
+| Label values | {`0`, `1`} | {`0`, `1`} |
+| Labels per instance | Single | Single |
+| Label distribution | Imbalanced: 63% label `0` | Imbalanced: 63% label `0` |
+
+### Registering loaders
+
+Loaders can be registered externally using the `loaders` registry as a decorator. For example:
+
+```python
+@ml_datasets.loaders("my_custom_loader")
+def my_custom_loader():
+    return load_some_data()
+
+assert "my_custom_loader" in ml_datasets.loaders
+```
+
+
+
+
+%prep
+%autosetup -n ml_datasets-%{version}
+# The -n value above must equal the top-level directory inside Source0, which uses the underscore spelling of the archive name.
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}  # package documentation directory
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi  # ship whichever doc or example trees the source provides
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}  # run find from the buildroot so the printed paths are install paths
+if [ -d usr/lib ]; then
+    find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst  # one absolute path per installed file
+fi
+if [ -d usr/lib64 ]; then
+    find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+    find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+    find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch filelist.lst doclist.lst  # both lists must exist for the mv steps below, even when nothing matched
+if [ -d usr/share/man ]; then
+    find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst  # .gz suffix: presumably man pages get compressed after this section, confirm against the build policy
+fi
+popd
+mv %{buildroot}/filelist.lst .  # the lists are consumed by the -f options of the files sections
+mv %{buildroot}/doclist.lst .
+ +%files -n python3-ml-datasets -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Wed May 10 2023 Python_Bot - 0.2.0-1 +- Package Spec generated diff --git a/sources b/sources new file mode 100644 index 0000000..6e4f87d --- /dev/null +++ b/sources @@ -0,0 +1 @@ +da3d4bf661213c6f6edac48a6c599639 ml_datasets-0.2.0.tar.gz -- cgit v1.2.3