author    | CoprDistGit <infra@openeuler.org> | 2023-06-20 04:19:03 +0000
committer | CoprDistGit <infra@openeuler.org> | 2023-06-20 04:19:03 +0000
commit    | cda12032f7d4bd9edc3104db3f1cc8117f9d9cf0 (patch)
tree      | 0da3de603ef1f600c3651e40f9db01ed76af1246
parent    | 84a0020c603a8c2c227a05a380a355b5e4d51bd7 (diff)
automatic import of python-smile-datasets (openeuler20.03)
-rw-r--r-- | .gitignore                 |   1
-rw-r--r-- | python-smile-datasets.spec | 434
-rw-r--r-- | sources                    |   1
3 files changed, 436 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/smile-datasets-0.0.6.tar.gz
diff --git a/python-smile-datasets.spec b/python-smile-datasets.spec
new file mode 100644
index 0000000..991ed74
--- /dev/null
+++ b/python-smile-datasets.spec
@@ -0,0 +1,434 @@
+%global _empty_manifest_terminate_build 0
+Name: python-smile-datasets
+Version: 0.0.6
+Release: 1
+Summary: La**S**t **mile** datasets: Use `tf.data` to solve the last mile data loading problem for tensorflow.
+License: Apache Software License
+URL: https://github.com/luozhouyang/smile-datasets
+Source0: https://mirrors.aliyun.com/pypi/web/packages/72/28/bdee1d8fadf99f99daea01b0a09546932871fb995e82838c6c59e5c94ffb/smile-datasets-0.0.6.tar.gz
+BuildArch: noarch
+
+Requires: python3-tokenizers
+Requires: python3-tensorflow
+
+%description
+# smile-datasets
+
+
+[](https://badge.fury.io/py/smile-datasets)
+[](https://badge.fury.io/py/smile-datasets)
+
+
+La**S**t **mile** Datasets: Use `tf.data` to solve the last mile data loading problem for tensorflow.
+
+If you want to load public datasets, try:
+
+* [tensorflow/datasets](https://github.com/tensorflow/datasets)
+* [huggingface/datasets](https://github.com/huggingface/datasets)
+
+If you want to load local, personal datasets with minimal boilerplate, use **Smile Dataset**!
+
+## Support Matrix
+
+| task                    | supported  | core abstractions |
+|:------------------------|:-----------|:------------------|
+| question answering      | [x]        | `ExampleForQuestionAnswering`, `DatasetForQuestionAnswering`, `DatapipeForQuestionAnswering` |
+| masked language model   | [x]        | `ExampleForMaskedLanguageModel`, `DatasetForMaskedLanguageModel`, `DatapipeForMaskedLanguageModel` |
+| sequence classification | [x]        | `ExampleForSequenceClassification`, `DatasetForSequenceClassification`, `DatapipeForSequenceClassification` |
+| token classification    | [x]        | `ExampleForTokenClassification`, `DatasetForTokenClassification`, `DatapipeForTokenClassification` |
+| unsupervised simcse     | [x]        | `ExampleForUnsupervisedSimCSE`, `DatasetForUnsupervisedSimCSE`, `DatapipeForUnsupervisedSimCSE` |
+| supervised simcse       | [x]        | `ExampleForSupervisedSimCSE`, `DatasetForSupervisedSimCSE`, `DatapipeForSupervisedSimCSE` |
+| hard negative simcse    | [x]        | `ExampleForHardNegativeSimCSE`, `DatasetForHardNegativeSimCSE`, `DatapipeForHardNegativeSimCSE` |
+
+
+## Usage
+
+All datapipes for different tasks have the same interface.
+
+Here is an example for the question answering task, but you can use datapipes in the same way for other tasks.
+
+### Example for Question Answering
+
+```python
+
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering
+
+# each line is a JSON object, e.g. {"sequence": "我喜欢自然语言处理(NLP)"}
+train_input_jsonl_files = ["data/train.jsonl"]
+train_dataset = DatapipeForQuestionAnswering.from_jsonl_files(
+    input_files=train_input_jsonl_files,
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
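+
+# NOTE: `build_keras_model` above is only a placeholder -- smile-datasets does not
+# ship a model builder. A minimal, hypothetical sketch is given below; the input
+# name ("input_ids"), the default vocab size and the start/end label format are
+# assumptions and must be adapted to whatever features and labels your datapipe yields.
+import tensorflow as tf
+
+def build_keras_model(vocab_size=21128, hidden_size=128):
+    # single token-id input; real QA models usually also take segment ids, masks, etc.
+    input_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_ids")
+    x = tf.keras.layers.Embedding(vocab_size, hidden_size)(input_ids)
+    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_size, return_sequences=True))(x)
+    # one logit per token for the answer start and end positions
+    start_logits = tf.keras.layers.Flatten(name="start")(tf.keras.layers.Dense(1)(x))
+    end_logits = tf.keras.layers.Flatten(name="end")(tf.keras.layers.Dense(1)(x))
+    return tf.keras.Model(inputs=input_ids, outputs=[start_logits, end_logits])
+
+model = build_keras_model()
+model.compile(
+    optimizer="adam",
+    # assumes the labels are sparse start/end token indices
+    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+)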
+# train model
+model.fit(train_dataset, callbacks=[...])
+
+```
+
+
+For maximum flexibility, you can always subclass `DatasetForQuestionAnswering` to load your dataset, just like `torch.utils.data.Dataset`:
+
+```python
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering, ExampleForQuestionAnswering, ParserForQuestionAnswering
+
+class DuReaderDatasetForQuestionAnswering(DatasetForQuestionAnswering):
+    """Dataset reader for the DuReader dataset."""
+
+    def __init__(self, input_files, vocab_file, subset="rubost", **kwargs) -> None:
+        super().__init__()
+        self.parser = ParserForQuestionAnswering(tokenizer=None, vocab_file=vocab_file, **kwargs)
+        # `readers` stands for helpers that yield raw DuReader instances (not shown here)
+        if subset == "rubost":
+            self.instances = list(readers.read_dureader_rubost(input_files, **kwargs))
+        else:
+            self.instances = list(readers.read_dureader_checklist(input_files, **kwargs))
+        self.examples = []
+        for instance in self.instances:
+            e = self.parser.parse(instance)
+            if not e:
+                continue
+            self.examples.append(e)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, index) -> ExampleForQuestionAnswering:
+        return self.examples[index]
+
+
+dataset = DuReaderDatasetForQuestionAnswering(input_files=["data/train.jsonl"], vocab_file="bert/vocab.txt")
+train_dataset = DatapipeForQuestionAnswering.from_dataset(dataset, batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+For better performance, you can convert `dataset` to `tfrecord` ahead of time, and then build the datapipe from tfrecord files directly:
+
+```python
+# save dataset in tfrecord format
+dataset.save_tfrecord(output_files="data/train.tfrecord")
+
+# build datapipe from tfrecord files
+train_dataset = DatapipeForQuestionAnswering.from_tfrecord_files(input_files="data/train.tfrecord", batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+
+
+
+%package -n python3-smile-datasets
+Summary: La**S**t **mile** datasets: Use `tf.data` to solve the last mile data loading problem for tensorflow.
+Provides: python-smile-datasets
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-smile-datasets
+# smile-datasets
+
+
+[](https://badge.fury.io/py/smile-datasets)
+[](https://badge.fury.io/py/smile-datasets)
+
+
+La**S**t **mile** Datasets: Use `tf.data` to solve the last mile data loading problem for tensorflow.
+
+If you want to load public datasets, try:
+
+* [tensorflow/datasets](https://github.com/tensorflow/datasets)
+* [huggingface/datasets](https://github.com/huggingface/datasets)
+
+If you want to load local, personal datasets with minimal boilerplate, use **Smile Dataset**!
+
+## Support Matrix
+
+| task                    | supported  | core abstractions |
+|:------------------------|:-----------|:------------------|
+| question answering      | [x]        | `ExampleForQuestionAnswering`, `DatasetForQuestionAnswering`, `DatapipeForQuestionAnswering` |
+| masked language model   | [x]        | `ExampleForMaskedLanguageModel`, `DatasetForMaskedLanguageModel`, `DatapipeForMaskedLanguageModel` |
+| sequence classification | [x]        | `ExampleForSequenceClassification`, `DatasetForSequenceClassification`, `DatapipeForSequenceClassification` |
+| token classification    | [x]        | `ExampleForTokenClassification`, `DatasetForTokenClassification`, `DatapipeForTokenClassification` |
+| unsupervised simcse     | [x]        | `ExampleForUnsupervisedSimCSE`, `DatasetForUnsupervisedSimCSE`, `DatapipeForUnsupervisedSimCSE` |
+| supervised simcse       | [x]        | `ExampleForSupervisedSimCSE`, `DatasetForSupervisedSimCSE`, `DatapipeForSupervisedSimCSE` |
+| hard negative simcse    | [x]        | `ExampleForHardNegativeSimCSE`, `DatasetForHardNegativeSimCSE`, `DatapipeForHardNegativeSimCSE` |
+
+
+## Usage
+
+All datapipes for different tasks have the same interface.
+
+Here is an example for the question answering task, but you can use datapipes in the same way for other tasks.
+
+### Example for Question Answering
+
+```python
+
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering
+
+# each line is a JSON object, e.g. {"sequence": "我喜欢自然语言处理(NLP)"}
+train_input_jsonl_files = ["data/train.jsonl"]
+train_dataset = DatapipeForQuestionAnswering.from_jsonl_files(
+    input_files=train_input_jsonl_files,
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+
+```
+
+
+For maximum flexibility, you can always subclass `DatasetForQuestionAnswering` to load your dataset, just like `torch.utils.data.Dataset`:
+
+```python
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering, ExampleForQuestionAnswering, ParserForQuestionAnswering
+
+class DuReaderDatasetForQuestionAnswering(DatasetForQuestionAnswering):
+    """Dataset reader for the DuReader dataset."""
+
+    def __init__(self, input_files, vocab_file, subset="rubost", **kwargs) -> None:
+        super().__init__()
+        self.parser = ParserForQuestionAnswering(tokenizer=None, vocab_file=vocab_file, **kwargs)
+        # `readers` stands for helpers that yield raw DuReader instances (not shown here)
+        if subset == "rubost":
+            self.instances = list(readers.read_dureader_rubost(input_files, **kwargs))
+        else:
+            self.instances = list(readers.read_dureader_checklist(input_files, **kwargs))
+        self.examples = []
+        for instance in self.instances:
+            e = self.parser.parse(instance)
+            if not e:
+                continue
+            self.examples.append(e)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, index) -> ExampleForQuestionAnswering:
+        return self.examples[index]
+
+
+dataset = DuReaderDatasetForQuestionAnswering(input_files=["data/train.jsonl"], vocab_file="bert/vocab.txt")
+train_dataset = DatapipeForQuestionAnswering.from_dataset(dataset, batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+For better performance, you can convert `dataset` to `tfrecord` ahead of time, and then build the datapipe from tfrecord files directly:
+
+```python
+# save dataset in tfrecord format
+dataset.save_tfrecord(output_files="data/train.tfrecord")
+
+# build datapipe from tfrecord files
+train_dataset = DatapipeForQuestionAnswering.from_tfrecord_files(input_files="data/train.tfrecord", batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+
+
+
+%package help
+Summary: Development documents and examples for smile-datasets
+Provides: python3-smile-datasets-doc
+%description help
+# smile-datasets
+
+
+[](https://badge.fury.io/py/smile-datasets)
+[](https://badge.fury.io/py/smile-datasets)
+
+
+La**S**t **mile** Datasets: Use `tf.data` to solve the last mile data loading problem for tensorflow.
+
+If you want to load public datasets, try:
+
+* [tensorflow/datasets](https://github.com/tensorflow/datasets)
+* [huggingface/datasets](https://github.com/huggingface/datasets)
+
+If you want to load local, personal datasets with minimal boilerplate, use **Smile Dataset**!
+
+## Support Matrix
+
+| task                    | supported  | core abstractions |
+|:------------------------|:-----------|:------------------|
+| question answering      | [x]        | `ExampleForQuestionAnswering`, `DatasetForQuestionAnswering`, `DatapipeForQuestionAnswering` |
+| masked language model   | [x]        | `ExampleForMaskedLanguageModel`, `DatasetForMaskedLanguageModel`, `DatapipeForMaskedLanguageModel` |
+| sequence classification | [x]        | `ExampleForSequenceClassification`, `DatasetForSequenceClassification`, `DatapipeForSequenceClassification` |
+| token classification    | [x]        | `ExampleForTokenClassification`, `DatasetForTokenClassification`, `DatapipeForTokenClassification` |
+| unsupervised simcse     | [x]        | `ExampleForUnsupervisedSimCSE`, `DatasetForUnsupervisedSimCSE`, `DatapipeForUnsupervisedSimCSE` |
+| supervised simcse       | [x]        | `ExampleForSupervisedSimCSE`, `DatasetForSupervisedSimCSE`, `DatapipeForSupervisedSimCSE` |
+| hard negative simcse    | [x]        | `ExampleForHardNegativeSimCSE`, `DatasetForHardNegativeSimCSE`, `DatapipeForHardNegativeSimCSE` |
+
+
+## Usage
+
+All datapipes for different tasks have the same interface.
+
+Here is an example for the question answering task, but you can use datapipes in the same way for other tasks.
+
+### Example for Question Answering
+
+```python
+
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering
+
+# each line is a JSON object, e.g. {"sequence": "我喜欢自然语言处理(NLP)"}
+train_input_jsonl_files = ["data/train.jsonl"]
+train_dataset = DatapipeForQuestionAnswering.from_jsonl_files(
+    input_files=train_input_jsonl_files,
+    vocab_file="bert/vocab.txt",
+    batch_size=32,
+)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+
+```
+
+
+For maximum flexibility, you can always subclass `DatasetForQuestionAnswering` to load your dataset, just like `torch.utils.data.Dataset`:
+
+```python
+from smile_datasets import DatasetForQuestionAnswering, DatapipeForQuestionAnswering, ExampleForQuestionAnswering, ParserForQuestionAnswering
+
+class DuReaderDatasetForQuestionAnswering(DatasetForQuestionAnswering):
+    """Dataset reader for the DuReader dataset."""
+
+    def __init__(self, input_files, vocab_file, subset="rubost", **kwargs) -> None:
+        super().__init__()
+        self.parser = ParserForQuestionAnswering(tokenizer=None, vocab_file=vocab_file, **kwargs)
+        # `readers` stands for helpers that yield raw DuReader instances (not shown here)
+        if subset == "rubost":
+            self.instances = list(readers.read_dureader_rubost(input_files, **kwargs))
+        else:
+            self.instances = list(readers.read_dureader_checklist(input_files, **kwargs))
+        self.examples = []
+        for instance in self.instances:
+            e = self.parser.parse(instance)
+            if not e:
+                continue
+            self.examples.append(e)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, index) -> ExampleForQuestionAnswering:
+        return self.examples[index]
+
+
+dataset = DuReaderDatasetForQuestionAnswering(input_files=["data/train.jsonl"], vocab_file="bert/vocab.txt")
+train_dataset = DatapipeForQuestionAnswering.from_dataset(dataset, batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+For better performance, you can convert `dataset` to `tfrecord` ahead of time, and then build the datapipe from tfrecord files directly:
+
+```python
+# save dataset in tfrecord format
+dataset.save_tfrecord(output_files="data/train.tfrecord")
+
+# build datapipe from tfrecord files
+train_dataset = DatapipeForQuestionAnswering.from_tfrecord_files(input_files="data/train.tfrecord", batch_size=32)
+
+# check dataset
+print(next(iter(train_dataset)))
+
+# model = build_keras_model(...)
+# model.compile(...)
+# train model
+model.fit(train_dataset, callbacks=[...])
+```
+
+
+
+
+%prep
+%autosetup -n smile-datasets-0.0.6
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+    find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+    find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+    find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+    find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+    find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-smile-datasets -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Tue Jun 20 2023 Python_Bot <Python_Bot@openeuler.org> - 0.0.6-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+a8d77df596e6fcf240bbb5edacf783fa smile-datasets-0.0.6.tar.gz