author    CoprDistGit <infra@openeuler.org>    2023-06-20 04:19:03 +0000
committer CoprDistGit <infra@openeuler.org>    2023-06-20 04:19:03 +0000
commit    cda12032f7d4bd9edc3104db3f1cc8117f9d9cf0 (patch)
tree      0da3de603ef1f600c3651e40f9db01ed76af1246
parent    84a0020c603a8c2c227a05a380a355b5e4d51bd7 (diff)
automatic import of python-smile-datasets (openeuler20.03)
-rw-r--r--  .gitignore                    1
-rw-r--r--  python-smile-datasets.spec    434
-rw-r--r--  sources                       1
3 files changed, 436 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
index e69de29..d7f92c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/smile-datasets-0.0.6.tar.gz
diff --git a/python-smile-datasets.spec b/python-smile-datasets.spec
new file mode 100644
index 0000000..991ed74
--- /dev/null
+++ b/python-smile-datasets.spec
@@ -0,0 +1,434 @@
+%global _empty_manifest_terminate_build 0
+Name: python-smile-datasets
+Version: 0.0.6
+Release: 1
+Summary: LaSt mile datasets: use tf.data to solve the last-mile data loading problem for TensorFlow.
+License: Apache Software License
+URL: https://github.com/luozhouyang/smile-datasets
+Source0: https://mirrors.aliyun.com/pypi/web/packages/72/28/bdee1d8fadf99f99daea01b0a09546932871fb995e82838c6c59e5c94ffb/smile-datasets-0.0.6.tar.gz
+BuildArch: noarch
+
+Requires: python3-tokenizers
+Requires: python3-tensorflow
+
+%description
+# smile-datasets
+
+![Python package](https://github.com/luozhouyang/smile-datasets/workflows/Python%20package/badge.svg)
+[![PyPI version](https://badge.fury.io/py/smile-datasets.svg)](https://badge.fury.io/py/smile-datasets)
+[![Python](https://img.shields.io/pypi/pyversions/smile-datasets.svg?style=plastic)](https://badge.fury.io/py/smile-datasets)
+
+
+La**S**t **mile** Datasets: use `tf.data` to solve the last-mile data loading problem for TensorFlow.
+
+If you want to load public datasets, try:
+
+* [tensorflow/datasets](https://github.com/tensorflow/datasets)
+* [huggingface/datasets](https://github.com/huggingface/datasets)
+
+If you want to load local, personal datasets with minimal boilerplate, use **Smile Dataset**!
+
+## Support Matrix
+
+| task | supported | core abstractions |
+|:-----------------------|:-----------|:------------------|
+| question answering | [x] | `ExampleForQuestionAnswering`, `DatasetForQuestionAnswering`, `DatapipeForQuestionAnswering`|
+| masked language model | [x] | `ExampleForMaskedLanguageModel`, `DatasetForMaskedLanguageModel`, `DatapipeForMaskedLanguageModel`|
+| sequence classification| [x] | `ExampleForSequenceClassification`, `DatasetForSequenceClassification`, `DatapipeForSequenceClassification`|
+| token classification | [x] | `ExampleForTokenClassification`, `DatasetForTokenClassification`, `DatapipeForTokenClassification`|
+| unsupervised simcse | [x] | `ExampleForUnsupervisedSimCSE`, `DatasetForUnsupervisedSimCSE`, `DatapipeForUnsupervisedSimCSE`|
+| supervised simcse | [x] | `ExampleForSupervisedSimCSE`, `DatasetForSupervisedSimCSE`, `DatapipeForSupervisedSimCSE`|
+| hard negative simcse | [x] | `ExampleForHardNegativeSimCSE`, `DatasetForHardNegativeSimCSE`, `DatapipeForHardNegativeSimCSE`|
+
+
+## Usage
+
+All datapipes for the different tasks have the same interface.
+
+Here is an example for the question answering task; datapipes for the other tasks are used in the same way.
+
+### Example for Question Answering
+
+```python
+
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering
+
+# each line of the input file is a JSON object, e.g. {"sequence": "我喜欢自然语言处理(NLP)"}
+train_input_jsonl_files = ["data/train.jsonl"]
+train_dataset = DatapipeForQuestionAnswering.from_jsonl_files(
+    input_files=train_input_jsonl_files,
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+
+```
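+
+Because every datapipe exposes the same interface, switching tasks only changes the class you call. Below is a minimal sketch for the masked language model task; it assumes `DatapipeForMaskedLanguageModel.from_jsonl_files` accepts the same arguments as the question-answering datapipe above, and the input path is only illustrative.
+
+```python
+from smile_datasets import DatapipeForMaskedLanguageModel
+
+# assumption: same from_jsonl_files() signature as DatapipeForQuestionAnswering
+mlm_dataset = DatapipeForMaskedLanguageModel.from_jsonl_files(
+    input_files=["data/mlm_train.jsonl"],  # hypothetical file, one JSON object per line
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+print(next(iter(mlm_dataset)))
+```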
+
+
+For maximum flexibility, you can always subclass `DatasetForQuestionAnswering` to load your dataset, just like `torch.utils.data.Dataset`:
+
+```python
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering, ParserForQuestionAnswering, ExampleForQuestionAnswering
+
+class DuReaderDatasetForQuestionAnswering(DatasetForQuestionAnswering):
+    """Dataset reader for the DuReader dataset."""
+
+    def __init__(self, input_files, vocab_file, subset="rubost", **kwargs) -> None:
+        super().__init__()
+        self.parser = ParserForQuestionAnswering(tokenizer=None, vocab_file=vocab_file, **kwargs)
+        # `readers` is assumed to be this project's module of DuReader reading helpers
+        if subset == "rubost":
+            self.instances = list(readers.read_dureader_rubost(input_files, **kwargs))
+        else:
+            self.instances = list(readers.read_dureader_checklist(input_files, **kwargs))
+        self.examples = []
+        for instance in self.instances:
+            e = self.parser.parse(instance)
+            if not e:
+                continue
+            self.examples.append(e)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, index) -> ExampleForQuestionAnswering:
+        return self.examples[index]
+
+
+dataset = DuReaderDatasetForQuestionAnswering(input_files=["data/train.jsonl"], vocab_file="bert/vocab.txt")
+train_dataset = DatapipeForQuestionAnswering.from_dataset(dataset, batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+For better performance, you can convert the `dataset` to tfrecord files ahead of time and then build the datapipe from those files directly:
+
+```python
+# save dataset in tfrecord format
+dataset.save_tfrecord(output_files="data/train.tfrecord")
+
+# build datapipe from tfrecord files
+train_dataset = DatapipeForQuestionAnswering.from_tfrecord_files(input_files="data/train.tfrecord", batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+
+
+
+%package -n python3-smile-datasets
+Summary: LaSt mile datasets: use tf.data to solve the last-mile data loading problem for TensorFlow.
+Provides: python-smile-datasets
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-smile-datasets
+# smile-datasets
+
+![Python package](https://github.com/luozhouyang/smile-datasets/workflows/Python%20package/badge.svg)
+[![PyPI version](https://badge.fury.io/py/smile-datasets.svg)](https://badge.fury.io/py/smile-datasets)
+[![Python](https://img.shields.io/pypi/pyversions/smile-datasets.svg?style=plastic)](https://badge.fury.io/py/smile-datasets)
+
+
+La**S**t **mile** Datasets: use `tf.data` to solve the last-mile data loading problem for TensorFlow.
+
+If you want to load public datasets, try:
+
+* [tensorflow/datasets](https://github.com/tensorflow/datasets)
+* [huggingface/datasets](https://github.com/huggingface/datasets)
+
+If you want to load local, personal datasets with minimal boilerplate, use **Smile Dataset**!
+
+## Support Matrix
+
+| task | supported | core abstractions |
+|:-----------------------|:-----------|:------------------|
+| question answering | [x] | `ExampleForQuestionAnswering`, `DatasetForQuestionAnswering`, `DatapipeForQuestionAnswering`|
+| masked language model | [x] | `ExampleForMaskedLanguageModel`, `DatasetForMaskedLanguageModel`, `DatapipeForMaskedLanguageModel`|
+| sequence classification| [x] | `ExampleForSequenceClassification`, `DatasetForSequenceClassification`, `DatapipeForSequenceClassification`|
+| token classification | [x] | `ExampleForTokenClassification`, `DatasetForTokenClassification`, `DatapipeForTokenClassification`|
+| unsupervised simcse | [x] | `ExampleForUnsupervisedSimCSE`, `DatasetForUnsupervisedSimCSE`, `DatapipeForUnsupervisedSimCSE`|
+| supervised simcse | [x] | `ExampleForSupervisedSimCSE`, `DatasetForSupervisedSimCSE`, `DatapipeForSupervisedSimCSE`|
+| hard negative simcse | [x] | `ExampleForHardNegativeSimCSE`, `DatasetForHardNegativeSimCSE`, `DatapipeForHardNegativeSimCSE`|
+
+
+## Usage
+
+All datapipes for the different tasks have the same interface.
+
+Here is an example for the question answering task; datapipes for the other tasks are used in the same way.
+
+### Example for Question Answering
+
+```python
+
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering
+
+# each line of the input file is a JSON object, e.g. {"sequence": "我喜欢自然语言处理(NLP)"}
+train_input_jsonl_files = ["data/train.jsonl"]
+train_dataset = DatapipeForQuestionAnswering.from_jsonl_files(
+    input_files=train_input_jsonl_files,
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+
+```
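+
+Because every datapipe exposes the same interface, switching tasks only changes the class you call. Below is a minimal sketch for the masked language model task; it assumes `DatapipeForMaskedLanguageModel.from_jsonl_files` accepts the same arguments as the question-answering datapipe above, and the input path is only illustrative.
+
+```python
+from smile_datasets import DatapipeForMaskedLanguageModel
+
+# assumption: same from_jsonl_files() signature as DatapipeForQuestionAnswering
+mlm_dataset = DatapipeForMaskedLanguageModel.from_jsonl_files(
+    input_files=["data/mlm_train.jsonl"],  # hypothetical file, one JSON object per line
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+print(next(iter(mlm_dataset)))
+```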
+
+
+For maximum flexibility, you can always subclass `DatasetForQuestionAnswering` to load your dataset, just like `torch.utils.data.Dataset`:
+
+```python
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering, ParserForQuestionAnswering, ExampleForQuestionAnswering
+
+class DuReaderDatasetForQuestionAnswering(DatasetForQuestionAnswering):
+    """Dataset reader for the DuReader dataset."""
+
+    def __init__(self, input_files, vocab_file, subset="rubost", **kwargs) -> None:
+        super().__init__()
+        self.parser = ParserForQuestionAnswering(tokenizer=None, vocab_file=vocab_file, **kwargs)
+        # `readers` is assumed to be this project's module of DuReader reading helpers
+        if subset == "rubost":
+            self.instances = list(readers.read_dureader_rubost(input_files, **kwargs))
+        else:
+            self.instances = list(readers.read_dureader_checklist(input_files, **kwargs))
+        self.examples = []
+        for instance in self.instances:
+            e = self.parser.parse(instance)
+            if not e:
+                continue
+            self.examples.append(e)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, index) -> ExampleForQuestionAnswering:
+        return self.examples[index]
+
+
+dataset = DuReaderDatasetForQuestionAnswering(input_files=["data/train.jsonl"], vocab_file="bert/vocab.txt")
+train_dataset = DatapipeForQuestionAnswering.from_dataset(dataset, batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+For better performance, you can convert the `dataset` to tfrecord files ahead of time and then build the datapipe from those files directly:
+
+```python
+# save dataset in tfrecord format
+dataset.save_tfrecord(output_files="data/train.tfrecord")
+
+# build datapipe from tfrecord files
+train_dataset = DatapipeForQuestionAnswering.from_tfrecord_files(input_files="data/train.tfrecord", batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+
+
+
+%package help
+Summary: Development documents and examples for smile-datasets
+Provides: python3-smile-datasets-doc
+%description help
+# smile-datasets
+
+![Python package](https://github.com/luozhouyang/smile-datasets/workflows/Python%20package/badge.svg)
+[![PyPI version](https://badge.fury.io/py/smile-datasets.svg)](https://badge.fury.io/py/smile-datasets)
+[![Python](https://img.shields.io/pypi/pyversions/smile-datasets.svg?style=plastic)](https://badge.fury.io/py/smile-datasets)
+
+
+La**S**t **mile** Datasets: use `tf.data` to solve the last-mile data loading problem for TensorFlow.
+
+If you want to load public datasets, try:
+
+* [tensorflow/datasets](https://github.com/tensorflow/datasets)
+* [huggingface/datasets](https://github.com/huggingface/datasets)
+
+If you want to load local, personal datasets with minimal boilerplate, use **Smile Dataset**!
+
+## Support Matrix
+
+| task | supported | core abstractions |
+|:-----------------------|:-----------|:------------------|
+| question answering | [x] | `ExampleForQuestionAnswering`, `DatasetForQuestionAnswering`, `DatapipeForQuestionAnswering`|
+| masked language model | [x] | `ExampleForMaskedLanguageModel`, `DatasetForMaskedLanguageModel`, `DatapipeForMaskedLanguageModel`|
+| sequence classification| [x] | `ExampleForSequenceClassification`, `DatasetForSequenceClassification`, `DatapipeForSequenceClassification`|
+| token classification | [x] | `ExampleForTokenClassification`, `DatasetForTokenClassification`, `DatapipeForTokenClassification`|
+| unsupervised simcse | [x] | `ExampleForUnsupervisedSimCSE`, `DatasetForUnsupervisedSimCSE`, `DatapipeForUnsupervisedSimCSE`|
+| supervised simcse | [x] | `ExampleForSupervisedSimCSE`, `DatasetForSupervisedSimCSE`, `DatapipeForSupervisedSimCSE`|
+| hard negative simcse | [x] | `ExampleForHardNegativeSimCSE`, `DatasetForHardNegativeSimCSE`, `DatapipeForHardNegativeSimCSE`|
+
+
+## Usage
+
+All datapipes for the different tasks have the same interface.
+
+Here is an example for the question answering task; datapipes for the other tasks are used in the same way.
+
+### Example for Question Answering
+
+```python
+
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering
+
+# each line of the input file is a JSON object, e.g. {"sequence": "我喜欢自然语言处理(NLP)"}
+train_input_jsonl_files = ["data/train.jsonl"]
+train_dataset = DatapipeForQuestionAnswering.from_jsonl_files(
+    input_files=train_input_jsonl_files,
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+
+```
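+
+Because every datapipe exposes the same interface, switching tasks only changes the class you call. Below is a minimal sketch for the masked language model task; it assumes `DatapipeForMaskedLanguageModel.from_jsonl_files` accepts the same arguments as the question-answering datapipe above, and the input path is only illustrative.
+
+```python
+from smile_datasets import DatapipeForMaskedLanguageModel
+
+# assumption: same from_jsonl_files() signature as DatapipeForQuestionAnswering
+mlm_dataset = DatapipeForMaskedLanguageModel.from_jsonl_files(
+    input_files=["data/mlm_train.jsonl"],  # hypothetical file, one JSON object per line
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+print(next(iter(mlm_dataset)))
+```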
+
+
+For maximum flexibility, you can always subclass `DatasetForQuestionAnswering` to load your dataset, just like `torch.utils.data.Dataset`:
+
+```python
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering, ParserForQuestionAnswering, ExampleForQuestionAnswering
+
+class DuReaderDatasetForQuestionAnswering(DatasetForQuestionAnswering):
+    """Dataset reader for the DuReader dataset."""
+
+    def __init__(self, input_files, vocab_file, subset="rubost", **kwargs) -> None:
+        super().__init__()
+        self.parser = ParserForQuestionAnswering(tokenizer=None, vocab_file=vocab_file, **kwargs)
+        # `readers` is assumed to be this project's module of DuReader reading helpers
+        if subset == "rubost":
+            self.instances = list(readers.read_dureader_rubost(input_files, **kwargs))
+        else:
+            self.instances = list(readers.read_dureader_checklist(input_files, **kwargs))
+        self.examples = []
+        for instance in self.instances:
+            e = self.parser.parse(instance)
+            if not e:
+                continue
+            self.examples.append(e)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, index) -> ExampleForQuestionAnswering:
+        return self.examples[index]
+
+
+dataset = DuReaderDatasetForQuestionAnswering(input_files=["data/train.jsonl"], vocab_file="bert/vocab.txt")
+train_dataset = DatapipeForQuestionAnswering.from_dataset(dataset, batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+For better performance, you can convert the `dataset` to tfrecord files ahead of time and then build the datapipe from those files directly:
+
+```python
+# save dataset in tfrecord format
+dataset.save_tfrecord(output_files="data/train.tfrecord")
+
+# build datapipe from tfrecord files
+train_dataset = DatapipeForQuestionAnswering.from_tfrecord_files(input_files="data/train.tfrecord", batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+
+
+
+%prep
+%autosetup -n smile-datasets-0.0.6
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+ find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+ find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+ find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+ find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+ find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-smile-datasets -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Tue Jun 20 2023 Python_Bot <Python_Bot@openeuler.org> - 0.0.6-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..013c95e
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+a8d77df596e6fcf240bbb5edacf783fa smile-datasets-0.0.6.tar.gz