| author | CoprDistGit <infra@openeuler.org> | 2023-04-11 03:56:00 +0000 |
|---|---|---|
| committer | CoprDistGit <infra@openeuler.org> | 2023-04-11 03:56:00 +0000 |
| commit | 483b6ca3a5e85b79e3dd20f5161515d1e86c3c8e (patch) | |
| tree | 8583efc722585e2681eac3c422f4f83e0004bccc | |
| parent | a422f443c8960401259dbcbf388cf0da39f19874 (diff) | |
automatic import of python-pyarrowfs-adlgen2
| -rw-r--r-- | .gitignore | 1 |
| -rw-r--r-- | python-pyarrowfs-adlgen2.spec | 526 |
| -rw-r--r-- | sources | 1 |
3 files changed, 528 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+/pyarrowfs-adlgen2-0.2.4.tar.gz
diff --git a/python-pyarrowfs-adlgen2.spec b/python-pyarrowfs-adlgen2.spec
new file mode 100644
index 0000000..552544b
--- /dev/null
+++ b/python-pyarrowfs-adlgen2.spec
@@ -0,0 +1,526 @@
%global _empty_manifest_terminate_build 0
Name: python-pyarrowfs-adlgen2
Version: 0.2.4
Release: 1
Summary: Use pyarrow with Azure Data Lake gen2
License: MIT
URL: https://github.com/kaaveland/pyarrowfs-adlgen2
Source0: https://mirrors.nju.edu.cn/pypi/web/packages/d5/87/0b46d3f3781591604d54a9d15771f2a1c5133291cc1a177de3d7e9289b42/pyarrowfs-adlgen2-0.2.4.tar.gz
BuildArch: noarch

Requires: python3-pyarrow
Requires: python3-azure-storage-file-datalake
Requires: python3-pandas
Requires: python3-pytest

%description
pyarrowfs-adlgen2
==

pyarrowfs-adlgen2 is an implementation of a pyarrow filesystem for Azure Data Lake Gen2.

It allows you to use pyarrow and pandas to read parquet datasets directly from Azure, without first copying files to local storage.

Installation
--

`pip install pyarrowfs-adlgen2`

Reading datasets
--

Example usage with a pandas dataframe:

```python
import azure.identity
import pandas as pd
import pyarrow.fs
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME', azure.identity.DefaultAzureCredential())
fs = pyarrow.fs.PyFileSystem(handler)
df = pd.read_parquet('container/dataset.parq', filesystem=fs)
```

Example usage with arrow tables:

```python
import azure.identity
import pyarrow.dataset
import pyarrow.fs
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME', azure.identity.DefaultAzureCredential())
fs = pyarrow.fs.PyFileSystem(handler)
ds = pyarrow.dataset.dataset('container/dataset.parq', filesystem=fs)
table = ds.to_table()
```
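Because the handler is exposed through `pyarrow.fs.PyFileSystem`, the rest of the generic pyarrow filesystem API should work as well. A minimal sketch, assuming the `fs` object from the examples above, that lists the files of a dataset with `pyarrow.fs.FileSelector`:

```python
import pyarrow.fs

# Recursively list everything under the dataset directory. FileSelector
# and get_file_info are standard pyarrow.fs APIs that PyFileSystem
# forwards to the underlying handler.
selector = pyarrow.fs.FileSelector('container/dataset.parq', recursive=True)
for info in fs.get_file_info(selector):
    print(info.path, info.size)
```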
Configuring timeouts
--

Timeouts are passed to the azure-storage-file-datalake SDK methods; the unit is seconds:

```python
import azure.identity
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME',
    azure.identity.DefaultAzureCredential(),
    timeouts=pyarrowfs_adlgen2.Timeouts(file_system_timeout=10)
)
# or mutate it:
handler.timeouts.file_client_timeout = 20
```

Writing datasets
--

With pyarrow version 3 or greater, you can write datasets from arrow tables:

```python
import pyarrow as pa
import pyarrow.dataset

pyarrow.dataset.write_dataset(
    table,
    'name.pq',
    format='parquet',
    partitioning=pyarrow.dataset.partitioning(
        schema=pyarrow.schema([('year', pa.int32())]), flavor='hive'
    ),
    filesystem=pyarrow.fs.PyFileSystem(handler)
)
```
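To read such a hive-partitioned dataset back with the partition column restored, point pyarrow's dataset discovery at the same path. A small sketch, assuming the `fs` wrapper and the `name.pq` path from the example above:

```python
import pyarrow.dataset

# partitioning='hive' turns key=value directory names (year=2021/...)
# back into a 'year' column on the resulting table.
ds = pyarrow.dataset.dataset('name.pq', partitioning='hive', filesystem=fs)
table = ds.to_table()
```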
With earlier versions, files must be opened and written one at a time. As of pyarrow version 1.0.1, `pyarrow.parquet.ParquetWriter` does not support `pyarrow.fs.PyFileSystem`, but data can be written to open files:

```python
with fs.open_output_stream('container/out.parq') as out:
    df.to_parquet(out)
```

Or with arrow tables:

```python
import pyarrow.parquet

with fs.open_output_stream('container/out.parq') as out:
    pyarrow.parquet.write_table(table, out)
```
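Reading a single file back goes through the same stream API. A minimal sketch, assuming the `fs` and the path from the write examples above:

```python
import pyarrow.parquet

# open_input_file returns a random-access file, which parquet reading
# needs in order to seek to the footer.
with fs.open_input_file('container/out.parq') as f:
    table = pyarrow.parquet.read_table(f)
```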
Accessing only a single container/file-system
--

If you do not want to, or cannot, access the whole storage account as a single filesystem, you can use `pyarrowfs_adlgen2.FilesystemHandler` to view a single file system within an account:

```python
import azure.identity
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.FilesystemHandler.from_account_name(
    "STORAGE_ACCOUNT", "FS_NAME", azure.identity.DefaultAzureCredential())
```

All access is done through the file system within the storage account.
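Since all access then goes through that one file system, paths presumably no longer need the container prefix. A hypothetical sketch built on that assumption, reusing the handler from above:

```python
import pandas as pd
import pyarrow.fs

# Wrap the single-container handler exactly like the account handler.
# The container-less path is an assumption based on the note above,
# not a documented guarantee.
fs = pyarrow.fs.PyFileSystem(handler)
df = pd.read_parquet('dataset.parq', filesystem=fs)
```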
Set HTTP headers for files for pyarrow >= 5
--

You can set headers for any output files by using the `metadata` argument to `handler.open_output_stream`:

```python
import pyarrowfs_adlgen2

fs = pyarrowfs_adlgen2.AccountHandler.from_account_name("theaccount").to_fs()
metadata = {"content_type": "application/json"}
with fs.open_output_stream("container/data.json", metadata) as out:
    out.write(b"{}")
```

Note that the keys are spelled like Python attribute names (`content_type`), not like the HTTP headers themselves. For a list of valid keys, see
[ContentSettings](https://docs.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.contentsettings?view=azure-python).

You can do this for pyarrow >= 5 when using `pyarrow.fs.PyFileSystem`, and for any pyarrow version when using the handlers from pyarrowfs_adlgen2 directly.
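On older pyarrow versions, the same `metadata` argument can be passed when calling the handler's `open_output_stream` directly, as the note above suggests. A sketch under the assumption that the handler-level signature matches the fs-level one shown here:

```python
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name("theaccount")
# Bypass pyarrow.fs.PyFileSystem and talk to the handler directly,
# which the note above says works on any pyarrow version.
metadata = {"content_type": "application/json"}
with handler.open_output_stream("container/data.json", metadata) as out:
    out.write(b"{}")
```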
Running tests
--

To run the integration tests, you need:

- An Azure Storage account (V2) with hierarchical namespace enabled (a Data Lake gen2 account)
- A configured Azure login (e.g. run `az login`, or set environment variables; see `azure.identity.DefaultAzureCredential`)
- pytest installed, e.g. via `pip install pytest`

**NB! All data in the storage account is deleted during testing, USE AN EMPTY ACCOUNT**

```
AZUREARROWFS_TEST_ACT=thestorageaccount pytest
```


%package -n python3-pyarrowfs-adlgen2
Summary: Use pyarrow with Azure Data Lake gen2
Provides: python-pyarrowfs-adlgen2
BuildRequires: python3-devel
BuildRequires: python3-setuptools
BuildRequires: python3-pip
%description -n python3-pyarrowfs-adlgen2
pyarrowfs-adlgen2
==

pyarrowfs-adlgen2 is an implementation of a pyarrow filesystem for Azure Data Lake Gen2.

It allows you to use pyarrow and pandas to read parquet datasets directly from Azure, without first copying files to local storage.

Installation
--

`pip install pyarrowfs-adlgen2`

Reading datasets
--

Example usage with a pandas dataframe:

```python
import azure.identity
import pandas as pd
import pyarrow.fs
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME', azure.identity.DefaultAzureCredential())
fs = pyarrow.fs.PyFileSystem(handler)
df = pd.read_parquet('container/dataset.parq', filesystem=fs)
```

Example usage with arrow tables:

```python
import azure.identity
import pyarrow.dataset
import pyarrow.fs
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME', azure.identity.DefaultAzureCredential())
fs = pyarrow.fs.PyFileSystem(handler)
ds = pyarrow.dataset.dataset('container/dataset.parq', filesystem=fs)
table = ds.to_table()
```

Configuring timeouts
--

Timeouts are passed to the azure-storage-file-datalake SDK methods; the unit is seconds:

```python
import azure.identity
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME',
    azure.identity.DefaultAzureCredential(),
    timeouts=pyarrowfs_adlgen2.Timeouts(file_system_timeout=10)
)
# or mutate it:
handler.timeouts.file_client_timeout = 20
```

Writing datasets
--

With pyarrow version 3 or greater, you can write datasets from arrow tables:

```python
import pyarrow as pa
import pyarrow.dataset

pyarrow.dataset.write_dataset(
    table,
    'name.pq',
    format='parquet',
    partitioning=pyarrow.dataset.partitioning(
        schema=pyarrow.schema([('year', pa.int32())]), flavor='hive'
    ),
    filesystem=pyarrow.fs.PyFileSystem(handler)
)
```

With earlier versions, files must be opened and written one at a time. As of pyarrow version 1.0.1, `pyarrow.parquet.ParquetWriter` does not support `pyarrow.fs.PyFileSystem`, but data can be written to open files:

```python
with fs.open_output_stream('container/out.parq') as out:
    df.to_parquet(out)
```

Or with arrow tables:

```python
import pyarrow.parquet

with fs.open_output_stream('container/out.parq') as out:
    pyarrow.parquet.write_table(table, out)
```

Accessing only a single container/file-system
--

If you do not want to, or cannot, access the whole storage account as a single filesystem, you can use `pyarrowfs_adlgen2.FilesystemHandler` to view a single file system within an account:

```python
import azure.identity
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.FilesystemHandler.from_account_name(
    "STORAGE_ACCOUNT", "FS_NAME", azure.identity.DefaultAzureCredential())
```

All access is done through the file system within the storage account.

Set HTTP headers for files for pyarrow >= 5
--

You can set headers for any output files by using the `metadata` argument to `handler.open_output_stream`:

```python
import pyarrowfs_adlgen2

fs = pyarrowfs_adlgen2.AccountHandler.from_account_name("theaccount").to_fs()
metadata = {"content_type": "application/json"}
with fs.open_output_stream("container/data.json", metadata) as out:
    out.write(b"{}")
```

Note that the keys are spelled like Python attribute names (`content_type`), not like the HTTP headers themselves. For a list of valid keys, see
[ContentSettings](https://docs.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.contentsettings?view=azure-python).

You can do this for pyarrow >= 5 when using `pyarrow.fs.PyFileSystem`, and for any pyarrow version when using the handlers from pyarrowfs_adlgen2 directly.


Running tests
--

To run the integration tests, you need:

- An Azure Storage account (V2) with hierarchical namespace enabled (a Data Lake gen2 account)
- A configured Azure login (e.g. run `az login`, or set environment variables; see `azure.identity.DefaultAzureCredential`)
- pytest installed, e.g. via `pip install pytest`

**NB! All data in the storage account is deleted during testing, USE AN EMPTY ACCOUNT**

```
AZUREARROWFS_TEST_ACT=thestorageaccount pytest
```


%package help
Summary: Development documents and examples for pyarrowfs-adlgen2
Provides: python3-pyarrowfs-adlgen2-doc
%description help
pyarrowfs-adlgen2
==

pyarrowfs-adlgen2 is an implementation of a pyarrow filesystem for Azure Data Lake Gen2.

It allows you to use pyarrow and pandas to read parquet datasets directly from Azure, without first copying files to local storage.

Installation
--

`pip install pyarrowfs-adlgen2`

Reading datasets
--

Example usage with a pandas dataframe:

```python
import azure.identity
import pandas as pd
import pyarrow.fs
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME', azure.identity.DefaultAzureCredential())
fs = pyarrow.fs.PyFileSystem(handler)
df = pd.read_parquet('container/dataset.parq', filesystem=fs)
```

Example usage with arrow tables:

```python
import azure.identity
import pyarrow.dataset
import pyarrow.fs
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME', azure.identity.DefaultAzureCredential())
fs = pyarrow.fs.PyFileSystem(handler)
ds = pyarrow.dataset.dataset('container/dataset.parq', filesystem=fs)
table = ds.to_table()
```

Configuring timeouts
--

Timeouts are passed to the azure-storage-file-datalake SDK methods; the unit is seconds:

```python
import azure.identity
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(
    'YOUR_ACCOUNT_NAME',
    azure.identity.DefaultAzureCredential(),
    timeouts=pyarrowfs_adlgen2.Timeouts(file_system_timeout=10)
)
# or mutate it:
handler.timeouts.file_client_timeout = 20
```

Writing datasets
--

With pyarrow version 3 or greater, you can write datasets from arrow tables:

```python
import pyarrow as pa
import pyarrow.dataset

pyarrow.dataset.write_dataset(
    table,
    'name.pq',
    format='parquet',
    partitioning=pyarrow.dataset.partitioning(
        schema=pyarrow.schema([('year', pa.int32())]), flavor='hive'
    ),
    filesystem=pyarrow.fs.PyFileSystem(handler)
)
```

With earlier versions, files must be opened and written one at a time. As of pyarrow version 1.0.1, `pyarrow.parquet.ParquetWriter` does not support `pyarrow.fs.PyFileSystem`, but data can be written to open files:

```python
with fs.open_output_stream('container/out.parq') as out:
    df.to_parquet(out)
```

Or with arrow tables:

```python
import pyarrow.parquet

with fs.open_output_stream('container/out.parq') as out:
    pyarrow.parquet.write_table(table, out)
```

Accessing only a single container/file-system
--

If you do not want to, or cannot, access the whole storage account as a single filesystem, you can use `pyarrowfs_adlgen2.FilesystemHandler` to view a single file system within an account:

```python
import azure.identity
import pyarrowfs_adlgen2

handler = pyarrowfs_adlgen2.FilesystemHandler.from_account_name(
    "STORAGE_ACCOUNT", "FS_NAME", azure.identity.DefaultAzureCredential())
```

All access is done through the file system within the storage account.

Set HTTP headers for files for pyarrow >= 5
--

You can set headers for any output files by using the `metadata` argument to `handler.open_output_stream`:

```python
import pyarrowfs_adlgen2

fs = pyarrowfs_adlgen2.AccountHandler.from_account_name("theaccount").to_fs()
metadata = {"content_type": "application/json"}
with fs.open_output_stream("container/data.json", metadata) as out:
    out.write(b"{}")
```

Note that the keys are spelled like Python attribute names (`content_type`), not like the HTTP headers themselves. For a list of valid keys, see
[ContentSettings](https://docs.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.contentsettings?view=azure-python).

You can do this for pyarrow >= 5 when using `pyarrow.fs.PyFileSystem`, and for any pyarrow version when using the handlers from pyarrowfs_adlgen2 directly.


Running tests
--

To run the integration tests, you need:

- An Azure Storage account (V2) with hierarchical namespace enabled (a Data Lake gen2 account)
- A configured Azure login (e.g. run `az login`, or set environment variables; see `azure.identity.DefaultAzureCredential`)
- pytest installed, e.g. via `pip install pytest`

**NB! All data in the storage account is deleted during testing, USE AN EMPTY ACCOUNT**

```
AZUREARROWFS_TEST_ACT=thestorageaccount pytest
```


%prep
%autosetup -n pyarrowfs-adlgen2-0.2.4

%build
%py3_build

%install
%py3_install
install -d -m755 %{buildroot}/%{_pkgdocdir}
if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
pushd %{buildroot}
if [ -d usr/lib ]; then
    find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
fi
if [ -d usr/lib64 ]; then
    find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
fi
if [ -d usr/bin ]; then
    find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
fi
if [ -d usr/sbin ]; then
    find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
fi
touch doclist.lst
if [ -d usr/share/man ]; then
    find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
fi
popd
mv %{buildroot}/filelist.lst .
mv %{buildroot}/doclist.lst .

%files -n python3-pyarrowfs-adlgen2 -f filelist.lst
%dir %{python3_sitelib}/*

%files help -f doclist.lst
%{_docdir}/*

%changelog
* Tue Apr 11 2023 Python_Bot <Python_Bot@openeuler.org> - 0.2.4-1
- Package Spec generated
diff --git a/sources b/sources
@@ -0,0 +1 @@
+9267467b777cdf966798ed2c45e72376 pyarrowfs-adlgen2-0.2.4.tar.gz