Diffstat (limited to 'python-lazy-dataset.spec')
-rw-r--r-- | python-lazy-dataset.spec | 402
1 file changed, 402 insertions, 0 deletions
diff --git a/python-lazy-dataset.spec b/python-lazy-dataset.spec
new file mode 100644
index 0000000..72be5b7
--- /dev/null
+++ b/python-lazy-dataset.spec
@@ -0,0 +1,402 @@
+%global _empty_manifest_terminate_build 0
+Name: python-lazy-dataset
+Version: 0.0.14
+Release: 1
+Summary: Process large datasets as if they were iterables.
+License: MIT
+URL: https://github.com/fgnt/lazy_dataset
+Source0: https://mirrors.aliyun.com/pypi/web/packages/43/e6/5eaa9245879ef9346c2f81af5a47f93322f4f8013e5ed1142496fb15492d/lazy_dataset-0.0.14.tar.gz
+BuildArch: noarch
+
+Requires: python3-numpy
+Requires: python3-humanfriendly
+Requires: python3-mock
+Requires: python3-diskcache
+Requires: python3-psutil
+
+%description
+
+# lazy_dataset
+
+[](https://travis-ci.org/fgnt/lazy_dataset)
+
+[](https://codecov.io/github/fgnt/lazy_dataset?branch=master)
+[](https://github.com/fgnt/lazy_dataset/blob/master/LICENSE)
+
+Lazy_dataset is a helper to deal with large datasets that do not fit into memory.
+It allows you to define transformations that are applied lazily
+(e.g. a mapping function to read data from HDD). When someone iterates over the dataset, all
+transformations are applied.
+
+Supported transformations:
+ - `dataset.map(map_fn)`: Apply the function `map_fn` to each example ([builtins.map](https://docs.python.org/3/library/functions.html#map))
+ - `dataset[2]`: Get the example at index `2`.
+ - `dataset['example_id']`: Get the example that has the example ID `'example_id'`.
+ - `dataset[10:20]`: Get a sub-dataset that contains only the examples in the slice 10 to 20.
+ - `dataset.filter(filter_fn, lazy=True)`: Drops examples for which `filter_fn(example)` is false ([builtins.filter](https://docs.python.org/3/library/functions.html#filter)).
+ - `dataset.concatenate(*others)`: Concatenates two or more datasets ([numpy.concatenate](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.concatenate.html))
+ - `dataset.intersperse(*others)`: Combines two or more datasets such that the examples of each input dataset are evenly spaced (https://stackoverflow.com/a/19293603).
+ - `dataset.zip(*others)`: Zips two or more datasets
+ - `dataset.shuffle(reshuffle=False)`: Shuffles the dataset. When `reshuffle` is `True`, it reshuffles every time you iterate over the data.
+ - `dataset.tile(reps, shuffle=False)`: Repeats the dataset `reps` times and concatenates the copies ([numpy.tile](https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html))
+ - `dataset.groupby(group_fn)`: Groups examples together. In contrast to `itertools.groupby`, a prior sort is not necessary, as in pandas ([itertools.groupby](https://docs.python.org/3/library/itertools.html#itertools.groupby), [pandas.DataFrame.groupby](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html))
+ - `dataset.sort(key_fn, sort_fn=sorted)`: Sorts the examples by the value of `key_fn(example)` ([list.sort](https://docs.python.org/3/library/stdtypes.html#list.sort))
+ - `dataset.batch(batch_size, drop_last=False)`: Batches `batch_size` examples together as a list. Usually followed by a map ([tensorflow.data.Dataset.batch](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch))
+ - `dataset.random_choice()`: Get a random example ([numpy.random.choice](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.choice.html))
+ - `dataset.cache()`: Cache in RAM (similar to ESPnet's `keep_all_data_on_mem`)
+ - `dataset.diskcache()`: Cache to a cache directory on the local filesystem (useful on clusters with slow network filesystems)
+ - ...
+
+
+```python
+>>> from IPython.lib.pretty import pprint
+>>> import lazy_dataset
+>>> examples = {
+...     'example_id_1': {
+...         'observation': [1, 2, 3],
+...         'label': 1,
+...     },
+...     'example_id_2': {
+...         'observation': [4, 5, 6],
+...         'label': 2,
+...     },
+...     'example_id_3': {
+...         'observation': [7, 8, 9],
+...         'label': 3,
+...     },
+... }
+>>> for example_id, example in examples.items():
+...     example['example_id'] = example_id
+>>> ds = lazy_dataset.new(examples)
+>>> ds
+  DictDataset(len=3)
+MapDataset(_pickle.loads)
+>>> ds.keys()
+('example_id_1', 'example_id_2', 'example_id_3')
+>>> for example in ds:
+...     print(example)
+{'observation': [1, 2, 3], 'label': 1, 'example_id': 'example_id_1'}
+{'observation': [4, 5, 6], 'label': 2, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 3, 'example_id': 'example_id_3'}
+>>> def transform(example):
+...     example['label'] *= 10
+...     return example
+>>> ds = ds.map(transform)
+>>> for example in ds:
+...     print(example)
+{'observation': [1, 2, 3], 'label': 10, 'example_id': 'example_id_1'}
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 30, 'example_id': 'example_id_3'}
+>>> ds = ds.filter(lambda example: example['label'] > 15)
+>>> for example in ds:
+...     print(example)
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 30, 'example_id': 'example_id_3'}
+>>> ds['example_id_2']
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+>>> ds
+      DictDataset(len=3)
+    MapDataset(_pickle.loads)
+  MapDataset(<function transform at 0x7ff74efb6620>)
+FilterDataset(<function <lambda> at 0x7ff74efb67b8>)
+```
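+
+The walkthrough above covers `map`, `filter` and key access. As a complement, here is a minimal sketch of `shuffle`, `sort` and `batch`, written only against the signatures listed above; the `collate` helper and the commented batch contents are illustrative assumptions, not part of lazy_dataset.
+
+```python
+import lazy_dataset
+
+examples = {
+    f'example_id_{i}': {'observation': [i, i + 1, i + 2], 'label': i}
+    for i in range(1, 7)
+}
+ds = lazy_dataset.new(examples)
+
+# Reshuffle the examples on every pass over the data.
+shuffled = ds.shuffle(reshuffle=True)
+
+# Sort the examples by the value returned from key_fn.
+by_label = ds.sort(lambda example: example['label'])
+
+# Collate a batch (a list of example dicts) into a dict of lists.
+def collate(batch):
+    return {key: [example[key] for example in batch] for key in batch[0]}
+
+# Batch two examples at a time; as noted above, batch() is usually followed by a map.
+for batch in by_label.batch(2, drop_last=True).map(collate):
+    print(batch['label'])  # e.g. [1, 2], then [3, 4], then [5, 6]
+```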
+
+## Comparison with PyTorch's DataLoader
+
+See [here](comparison/comparison.md) for a feature and throughput comparison of lazy_dataset with PyTorch's DataLoader.
+
+## Installation
+
+If you just want to use it, install it directly with pip:
+
+```bash
+pip install lazy_dataset
+```
+
+If you want to make changes or need the most recent version, clone the repository and install it as follows:
+
+```bash
+git clone https://github.com/fgnt/lazy_dataset.git
+cd lazy_dataset
+pip install --editable .
+```
+
+
+
+
+%package -n python3-lazy-dataset
+Summary: Process large datasets as if they were iterables.
+Provides: python-lazy-dataset
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-lazy-dataset
+
+# lazy_dataset
+
+[](https://travis-ci.org/fgnt/lazy_dataset)
+
+[](https://codecov.io/github/fgnt/lazy_dataset?branch=master)
+[](https://github.com/fgnt/lazy_dataset/blob/master/LICENSE)
+
+Lazy_dataset is a helper to deal with large datasets that do not fit into memory.
+It allows you to define transformations that are applied lazily
+(e.g. a mapping function to read data from HDD). When someone iterates over the dataset, all
+transformations are applied.
+
+Supported transformations:
+ - `dataset.map(map_fn)`: Apply the function `map_fn` to each example ([builtins.map](https://docs.python.org/3/library/functions.html#map))
+ - `dataset[2]`: Get the example at index `2`.
+ - `dataset['example_id']`: Get the example that has the example ID `'example_id'`.
+ - `dataset[10:20]`: Get a sub-dataset that contains only the examples in the slice 10 to 20.
+ - `dataset.filter(filter_fn, lazy=True)`: Drops examples for which `filter_fn(example)` is false ([builtins.filter](https://docs.python.org/3/library/functions.html#filter)).
+ - `dataset.concatenate(*others)`: Concatenates two or more datasets ([numpy.concatenate](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.concatenate.html))
+ - `dataset.intersperse(*others)`: Combines two or more datasets such that the examples of each input dataset are evenly spaced (https://stackoverflow.com/a/19293603).
+ - `dataset.zip(*others)`: Zips two or more datasets
+ - `dataset.shuffle(reshuffle=False)`: Shuffles the dataset. When `reshuffle` is `True`, it reshuffles every time you iterate over the data.
+ - `dataset.tile(reps, shuffle=False)`: Repeats the dataset `reps` times and concatenates the copies ([numpy.tile](https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html))
+ - `dataset.groupby(group_fn)`: Groups examples together. In contrast to `itertools.groupby`, a prior sort is not necessary, as in pandas ([itertools.groupby](https://docs.python.org/3/library/itertools.html#itertools.groupby), [pandas.DataFrame.groupby](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html))
+ - `dataset.sort(key_fn, sort_fn=sorted)`: Sorts the examples by the value of `key_fn(example)` ([list.sort](https://docs.python.org/3/library/stdtypes.html#list.sort))
+ - `dataset.batch(batch_size, drop_last=False)`: Batches `batch_size` examples together as a list. Usually followed by a map ([tensorflow.data.Dataset.batch](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch))
+ - `dataset.random_choice()`: Get a random example ([numpy.random.choice](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.choice.html))
+ - `dataset.cache()`: Cache in RAM (similar to ESPnet's `keep_all_data_on_mem`)
+ - `dataset.diskcache()`: Cache to a cache directory on the local filesystem (useful on clusters with slow network filesystems)
+ - ...
+
+
+```python
+>>> from IPython.lib.pretty import pprint
+>>> import lazy_dataset
+>>> examples = {
+...     'example_id_1': {
+...         'observation': [1, 2, 3],
+...         'label': 1,
+...     },
+...     'example_id_2': {
+...         'observation': [4, 5, 6],
+...         'label': 2,
+...     },
+...     'example_id_3': {
+...         'observation': [7, 8, 9],
+...         'label': 3,
+...     },
+... }
+>>> for example_id, example in examples.items():
+...     example['example_id'] = example_id
+>>> ds = lazy_dataset.new(examples)
+>>> ds
+  DictDataset(len=3)
+MapDataset(_pickle.loads)
+>>> ds.keys()
+('example_id_1', 'example_id_2', 'example_id_3')
+>>> for example in ds:
+...     print(example)
+{'observation': [1, 2, 3], 'label': 1, 'example_id': 'example_id_1'}
+{'observation': [4, 5, 6], 'label': 2, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 3, 'example_id': 'example_id_3'}
+>>> def transform(example):
+...     example['label'] *= 10
+...     return example
+>>> ds = ds.map(transform)
+>>> for example in ds:
+...     print(example)
+{'observation': [1, 2, 3], 'label': 10, 'example_id': 'example_id_1'}
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 30, 'example_id': 'example_id_3'}
+>>> ds = ds.filter(lambda example: example['label'] > 15)
+>>> for example in ds:
+...     print(example)
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 30, 'example_id': 'example_id_3'}
+>>> ds['example_id_2']
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+>>> ds
+      DictDataset(len=3)
+    MapDataset(_pickle.loads)
+  MapDataset(<function transform at 0x7ff74efb6620>)
+FilterDataset(<function <lambda> at 0x7ff74efb67b8>)
+```
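+
+Datasets can also be combined with each other. The snippet below is a minimal sketch of `concatenate`, `intersperse` and `tile` based on the method list above; the exact interleaving order produced by `intersperse` is an assumption here.
+
+```python
+import lazy_dataset
+
+clean = lazy_dataset.new({'a': {'label': 0}, 'b': {'label': 1}})
+noisy = lazy_dataset.new({'c': {'label': 2}, 'd': {'label': 3}})
+
+# Append one dataset after the other.
+both = clean.concatenate(noisy)
+
+# Space the examples of each input evenly instead of appending them.
+mixed = clean.intersperse(noisy)
+
+# Repeat the combined dataset three times, shuffling the repetitions.
+tiled = both.tile(3, shuffle=True)
+
+for example in mixed:
+    print(example)  # examples from `clean` and `noisy`, evenly interleaved
+```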
+
+## Comparison with PyTorch's DataLoader
+
+See [here](comparison/comparison.md) for a feature and throughput comparison of lazy_dataset with PyTorch's DataLoader.
+
+## Installation
+
+If you just want to use it, install it directly with pip:
+
+```bash
+pip install lazy_dataset
+```
+
+If you want to make changes or need the most recent version, clone the repository and install it as follows:
+
+```bash
+git clone https://github.com/fgnt/lazy_dataset.git
+cd lazy_dataset
+pip install --editable .
+```
+
+
+
+
+%package help
+Summary: Development documents and examples for lazy-dataset
+Provides: python3-lazy-dataset-doc
+%description help
+
+# lazy_dataset
+
+[](https://travis-ci.org/fgnt/lazy_dataset)
+
+[](https://codecov.io/github/fgnt/lazy_dataset?branch=master)
+[](https://github.com/fgnt/lazy_dataset/blob/master/LICENSE)
+
+Lazy_dataset is a helper to deal with large datasets that do not fit into memory.
+It allows you to define transformations that are applied lazily
+(e.g. a mapping function to read data from HDD). When someone iterates over the dataset, all
+transformations are applied.
+
+Supported transformations:
+ - `dataset.map(map_fn)`: Apply the function `map_fn` to each example ([builtins.map](https://docs.python.org/3/library/functions.html#map))
+ - `dataset[2]`: Get the example at index `2`.
+ - `dataset['example_id']`: Get the example that has the example ID `'example_id'`.
+ - `dataset[10:20]`: Get a sub-dataset that contains only the examples in the slice 10 to 20.
+ - `dataset.filter(filter_fn, lazy=True)`: Drops examples for which `filter_fn(example)` is false ([builtins.filter](https://docs.python.org/3/library/functions.html#filter)).
+ - `dataset.concatenate(*others)`: Concatenates two or more datasets ([numpy.concatenate](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.concatenate.html))
+ - `dataset.intersperse(*others)`: Combines two or more datasets such that the examples of each input dataset are evenly spaced (https://stackoverflow.com/a/19293603).
+ - `dataset.zip(*others)`: Zips two or more datasets
+ - `dataset.shuffle(reshuffle=False)`: Shuffles the dataset. When `reshuffle` is `True`, it reshuffles every time you iterate over the data.
+ - `dataset.tile(reps, shuffle=False)`: Repeats the dataset `reps` times and concatenates the copies ([numpy.tile](https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html))
+ - `dataset.groupby(group_fn)`: Groups examples together. In contrast to `itertools.groupby`, a prior sort is not necessary, as in pandas ([itertools.groupby](https://docs.python.org/3/library/itertools.html#itertools.groupby), [pandas.DataFrame.groupby](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html))
+ - `dataset.sort(key_fn, sort_fn=sorted)`: Sorts the examples by the value of `key_fn(example)` ([list.sort](https://docs.python.org/3/library/stdtypes.html#list.sort))
+ - `dataset.batch(batch_size, drop_last=False)`: Batches `batch_size` examples together as a list. Usually followed by a map ([tensorflow.data.Dataset.batch](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch))
+ - `dataset.random_choice()`: Get a random example ([numpy.random.choice](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.choice.html))
+ - `dataset.cache()`: Cache in RAM (similar to ESPnet's `keep_all_data_on_mem`)
+ - `dataset.diskcache()`: Cache to a cache directory on the local filesystem (useful on clusters with slow network filesystems)
+ - ...
+
+
+```python
+>>> from IPython.lib.pretty import pprint
+>>> import lazy_dataset
+>>> examples = {
+...     'example_id_1': {
+...         'observation': [1, 2, 3],
+...         'label': 1,
+...     },
+...     'example_id_2': {
+...         'observation': [4, 5, 6],
+...         'label': 2,
+...     },
+...     'example_id_3': {
+...         'observation': [7, 8, 9],
+...         'label': 3,
+...     },
+... }
+>>> for example_id, example in examples.items():
+...     example['example_id'] = example_id
+>>> ds = lazy_dataset.new(examples)
+>>> ds
+  DictDataset(len=3)
+MapDataset(_pickle.loads)
+>>> ds.keys()
+('example_id_1', 'example_id_2', 'example_id_3')
+>>> for example in ds:
+...     print(example)
+{'observation': [1, 2, 3], 'label': 1, 'example_id': 'example_id_1'}
+{'observation': [4, 5, 6], 'label': 2, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 3, 'example_id': 'example_id_3'}
+>>> def transform(example):
+...     example['label'] *= 10
+...     return example
+>>> ds = ds.map(transform)
+>>> for example in ds:
+...     print(example)
+{'observation': [1, 2, 3], 'label': 10, 'example_id': 'example_id_1'}
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 30, 'example_id': 'example_id_3'}
+>>> ds = ds.filter(lambda example: example['label'] > 15)
+>>> for example in ds:
+...     print(example)
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+{'observation': [7, 8, 9], 'label': 30, 'example_id': 'example_id_3'}
+>>> ds['example_id_2']
+{'observation': [4, 5, 6], 'label': 20, 'example_id': 'example_id_2'}
+>>> ds
+      DictDataset(len=3)
+    MapDataset(_pickle.loads)
+  MapDataset(<function transform at 0x7ff74efb6620>)
+FilterDataset(<function <lambda> at 0x7ff74efb67b8>)
+```
+
+## Comparison with PyTorch's DataLoader
+
+See [here](comparison/comparison.md) for a feature and throughput comparison of lazy_dataset with PyTorch's DataLoader.
+
+## Installation
+
+If you just want to use it, install it directly with pip:
+
+```bash
+pip install lazy_dataset
+```
+
+If you want to make changes or need the most recent version, clone the repository and install it as follows:
+
+```bash
+git clone https://github.com/fgnt/lazy_dataset.git
+cd lazy_dataset
+pip install --editable .
+```
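+
+As a last sketch, the two caching helpers from the list above; calling `cache()` and `diskcache()` without arguments follows the signatures shown there, while the location and eviction behaviour of the disk cache are not covered here.
+
+```python
+import lazy_dataset
+
+def expensive_load(example):
+    # Placeholder for e.g. reading audio from a slow network filesystem.
+    return example
+
+examples = {f'id_{i}': {'value': i} for i in range(100)}
+ds = lazy_dataset.new(examples).map(expensive_load)
+
+# Keep computed examples in RAM (similar to ESPnet's keep_all_data_on_mem).
+ds_ram = ds.cache()
+
+# Or cache to a directory on the local filesystem; this relies on the optional
+# diskcache dependency listed in the Requires above.
+ds_disk = ds.diskcache()
+
+for _ in ds_ram:
+    pass  # the first pass fills the cache, later passes reuse it
+```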
+
+
+
+
+%prep
+%autosetup -n lazy_dataset-0.0.14
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+	find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+	find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+	find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+	find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+	find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-lazy-dataset -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Tue Jun 20 2023 Python_Bot <Python_Bot@openeuler.org> - 0.0.14-1
+- Package Spec generated