| field | value | date |
|---|---|---|
| author | CoprDistGit <infra@openeuler.org> | 2023-05-10 06:14:02 +0000 |
| committer | CoprDistGit <infra@openeuler.org> | 2023-05-10 06:14:02 +0000 |
| commit | 9f06bcc5430f93a6522501aa87396f6733a50168 | |
| tree | a02f1a57df046a32b2734cf08ae83f8917e47cce /python-hip-data-tools.spec | |
| parent | 014a935ab8e11592f697375e839a18713aa9ac7a | |
automatic import of python-hip-data-tools
Diffstat (limited to 'python-hip-data-tools.spec')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | python-hip-data-tools.spec | 585 |

1 file changed, 585 insertions, 0 deletions
diff --git a/python-hip-data-tools.spec b/python-hip-data-tools.spec
new file mode 100644
index 0000000..5ac8f9d
--- /dev/null
+++ b/python-hip-data-tools.spec
@@ -0,0 +1,585 @@
+%global _empty_manifest_terminate_build 0
+Name: python-hip-data-tools
+Version: 1.67.2
+Release: 1
+Summary: Common Python tools and utilities for data engineering.
+License: OSL
+URL: https://pypi.org/project/hip-data-tools/
+Source0: https://mirrors.nju.edu.cn/pypi/web/packages/85/5e/55975ab92d2b5b1f90537237e798db805e83fb6551a5ce1f3b8b8a78d7db/hip_data_tools-1.67.2.tar.gz
+BuildArch: noarch
+
+Requires: python3-Cython
+Requires: python3-GitPython
+Requires: python3-arrow
+Requires: python3-boto3
+Requires: python3-botocore
+Requires: python3-cassandra-driver
+Requires: python3-confluent-kafka
+Requires: python3-fastparquet
+Requires: python3-googleads
+Requires: python3-gspread
+Requires: python3-joblib
+Requires: python3-lxml
+Requires: python3-mysqlclient
+Requires: python3-oauth2client
+Requires: python3-pandas
+Requires: python3-pyarrow
+Requires: python3-retrying
+Requires: python3-stringcase
+
+%description
+# hip-data-tools
+© Hipages Group Pty Ltd 2019-2022
+
+[PyPI version](https://pypi.org/project/hip-data-tools/#history)
+[CircleCI build](https://circleci.com/gh/hipagesgroup/data-tools/tree/master)
+[Maintainability](https://codeclimate.com/repos/5d53b4c199b9430177008586/maintainability)
+[Test coverage](https://codeclimate.com/repos/5d53b4c199b9430177008586/test_coverage)
+
+Common Python tools and utilities for data engineering, ETL, exploration, etc.
+The package is uploaded to PyPI for easy drop-in use in various environments, such as (but not limited to):
+
+1. Running production workloads
+2. ML training in Jupyter-like notebooks
+3. Local machine for dev and exploration
+
+## Installation
+Install from the PyPI repo:
+```bash
+pip3 install hip-data-tools
+```
+
+Install from source:
+```bash
+pip3 install .
+```
+
+## MacOS Dependencies
+```
+brew install libev
+brew install librdkafka
+```
+
+## Connect to AWS
+
+You will need to instantiate an AWS connection:
+```python
+from hip_data_tools.aws.common import AwsConnectionManager, AwsConnectionSettings, AwsSecretsManager
+
+# To connect using an AWS CLI profile
+conn = AwsConnectionManager(AwsConnectionSettings(region="ap-southeast-2", secrets_manager=None, profile="default"))
+
+# OR, to connect using the standard AWS environment variables
+conn = AwsConnectionManager(settings=AwsConnectionSettings(region="ap-southeast-2", secrets_manager=AwsSecretsManager(), profile=None))
+
+# OR, to connect using a custom set of environment variables
+conn = AwsConnectionManager(
+    settings=AwsConnectionSettings(
+        region="ap-southeast-2",
+        secrets_manager=AwsSecretsManager(
+            access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
+            secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
+            use_session_token=True,
+            aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"
+        ),
+        profile=None,
+    )
+)
+```
+
+Using this connection object you can use the AWS utilities, for example AWS Athena:
+```python
+from hip_data_tools.aws.athena import AthenaUtil
+
+au = AthenaUtil(database="default", conn=conn, output_bucket="example", output_key="tmp/scratch/")
+result = au.run_query("SELECT * FROM temp limit 10", return_result=True)
+print(result)
+```
+
+## Connect to Cassandra
+
+```python
+from cassandra.policies import DCAwareRoundRobinPolicy
+from cassandra.cqlengine import columns
+from cassandra.cqlengine.management import sync_table
+from cassandra.cqlengine.models import Model
+from cassandra import ConsistencyLevel
+# Assumed import path for the connection helpers used below
+from hip_data_tools.apache.cassandra import CassandraConnectionManager, \
+    CassandraConnectionSettings, CassandraSecretsManager
+
+load_balancing_policy = DCAwareRoundRobinPolicy(local_dc='AWS_VPC_AP_SOUTHEAST_2')
+
+# Connect with an explicit settings object
+conn = CassandraConnectionManager(
+    settings=CassandraConnectionSettings(
+        cluster_ips=["1.1.1.1", "2.2.2.2"],
+        port=9042,
+        load_balancing_policy=load_balancing_policy,
+    ),
+    consistency_level=ConsistencyLevel.LOCAL_QUORUM
+)
+
+# OR source the username from a custom environment variable
+conn = CassandraConnectionManager(
+    CassandraConnectionSettings(
+        cluster_ips=["1.1.1.1", "2.2.2.2"],
+        port=9042,
+        load_balancing_policy=load_balancing_policy,
+        secrets_manager=CassandraSecretsManager(
+            username_var="MY_CUSTOM_USERNAME_ENV_VAR"),
+    ),
+    consistency_level=ConsistencyLevel.LOCAL_ONE
+)
+
+# For running Cassandra model operations
+conn.setup_connection("dev_space")
+
+class ExampleModel(Model):
+    example_type = columns.Integer(primary_key=True)
+    created_at = columns.DateTime()
+    description = columns.Text(required=False)
+
+sync_table(ExampleModel)
+```
+
+## Connect to Google Sheets
+
+#### How to connect
+You need to go to the Google developer console and get credentials. The Google sheet then needs to be shared with the client email, and GoogleApiConnectionSettings needs to be provided with the Google API credentials key JSON. You can then access the Google sheet using the workbook_url and the sheet name.
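+
+As a rough sketch of the wiring (the module paths and the keys_object argument below are assumptions for illustration, not confirmed API):
+```python
+import json
+
+# Assumed module paths for the classes named in this README
+from hip_data_tools.google.common import GoogleApiConnectionSettings
+from hip_data_tools.google.sheets import GoogleSheetConnectionManager
+
+# Load the service-account key JSON obtained from the Google developer console
+with open("service-account-key.json") as key_file:
+    key_json = json.load(key_file)
+
+# keys_object is an assumed parameter name for the credentials key JSON
+settings = GoogleApiConnectionSettings(keys_object=key_json)
+connection_manager = GoogleSheetConnectionManager(settings)
+```
+Remember to share the workbook with the client email from the key JSON before connecting.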
+
+#### How to instantiate Sheet Util
+You can instantiate Sheet Util by providing a GoogleSheetConnectionManager, the workbook_url, and the sheet name.
+```python
+sheet_util = SheetUtil(
+    conn_manager=GoogleSheetConnectionManager(self.settings.source_connection_settings),
+    workbook_url='https://docs.google.com/spreadsheets/d/cKyrzCBLfsQM/edit?usp=sharing',
+    sheet='Sheet1')
+```
+
+#### How to read a dataframe using SheetUtil
+You can get the data in the Google sheet as a Pandas DataFrame using the SheetUtil. We have defined a template for the Google sheet to use with this utility.
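+
+As an illustration, the sheet template implied by the row numbers in the call below would look roughly like this (the example values are made up):
+```
+row 7: number | text  | date        <- field types (field_types_row_number)
+row 8: id     | name  | created_at  <- field names (field_names_row_number)
+row 9: 1      | alice | 2023-01-01  <- data starts (data_start_row_number)
+```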
+
+You need to provide the "field_names_row_number" and "field_types_row_number" to call the "get_data_frame()" method in SheetUtil.
+
+```python
+sheet_data = sheet_util.get_data_frame(
+    field_names_row_number=8,
+    field_types_row_number=7,
+    row_range="12:20",
+    data_start_row_number=9)
+```
+
+You can use the load_sheet_to_athena() function to load Google sheet data into an Athena table.
+
+```python
+GoogleSheetToAthena(GoogleSheetsToAthenaSettings(
+    source_workbook_url='https://docs.google.com/spreadsheets/d/cKyrzCBLfsQM/edit?usp=sharing',
+    source_sheet='spec_example',
+    source_row_range=None,
+    source_fields=None,
+    source_field_names_row_number=5,
+    source_field_types_row_number=4,
+    source_data_start_row_number=6,
+    source_connection_settings=get_google_connection_settings(gcp_conn_id=GCP_CONN_ID),
+    manual_partition_key_value={"column": "start_date", "value": START_DATE},
+    target_database=athena_util.database,
+    target_table_name=TABLE_NAME,
+    target_s3_bucket=s3_util.bucket,
+    target_s3_dir=s3_dir,
+    target_connection_settings=get_aws_connection_settings(aws_conn_id=AWS_CONN_ID),
+    target_table_ddl_progress=False
+)).load_sheet_to_athena()
+```
+
+There is an integration test called "integration_test_should__load_sheet_to_athena__when_using_sheetUtil" that covers this functionality. You can run it by removing the "integration_" prefix from its name.
+
+%package -n python3-hip-data-tools
+Summary: Common Python tools and utilities for data engineering.
+Provides: python-hip-data-tools
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-hip-data-tools
+# hip-data-tools
+© Hipages Group Pty Ltd 2019-2022
+
+[PyPI version](https://pypi.org/project/hip-data-tools/#history)
+[CircleCI build](https://circleci.com/gh/hipagesgroup/data-tools/tree/master)
+[Maintainability](https://codeclimate.com/repos/5d53b4c199b9430177008586/maintainability)
+[Test coverage](https://codeclimate.com/repos/5d53b4c199b9430177008586/test_coverage)
+
+Common Python tools and utilities for data engineering, ETL, exploration, etc.
+The package is uploaded to PyPI for easy drop-in use in various environments, such as (but not limited to):
+
+1. Running production workloads
+2. ML training in Jupyter-like notebooks
+3. Local machine for dev and exploration
+
+## Installation
+Install from the PyPI repo:
+```bash
+pip3 install hip-data-tools
+```
+
+Install from source:
+```bash
+pip3 install .
+```
+
+## MacOS Dependencies
+```
+brew install libev
+brew install librdkafka
+```
+
+## Connect to AWS
+
+You will need to instantiate an AWS connection:
+```python
+from hip_data_tools.aws.common import AwsConnectionManager, AwsConnectionSettings, AwsSecretsManager
+
+# To connect using an AWS CLI profile
+conn = AwsConnectionManager(AwsConnectionSettings(region="ap-southeast-2", secrets_manager=None, profile="default"))
+
+# OR, to connect using the standard AWS environment variables
+conn = AwsConnectionManager(settings=AwsConnectionSettings(region="ap-southeast-2", secrets_manager=AwsSecretsManager(), profile=None))
+
+# OR, to connect using a custom set of environment variables
+conn = AwsConnectionManager(
+    settings=AwsConnectionSettings(
+        region="ap-southeast-2",
+        secrets_manager=AwsSecretsManager(
+            access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
+            secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
+            use_session_token=True,
+            aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"
+        ),
+        profile=None,
+    )
+)
+```
+
+Using this connection object you can use the AWS utilities, for example AWS Athena:
+```python
+from hip_data_tools.aws.athena import AthenaUtil
+
+au = AthenaUtil(database="default", conn=conn, output_bucket="example", output_key="tmp/scratch/")
+result = au.run_query("SELECT * FROM temp limit 10", return_result=True)
+print(result)
+```
+
+## Connect to Cassandra
+
+```python
+from cassandra.policies import DCAwareRoundRobinPolicy
+from cassandra.cqlengine import columns
+from cassandra.cqlengine.management import sync_table
+from cassandra.cqlengine.models import Model
+from cassandra import ConsistencyLevel
+# Assumed import path for the connection helpers used below
+from hip_data_tools.apache.cassandra import CassandraConnectionManager, \
+    CassandraConnectionSettings, CassandraSecretsManager
+
+load_balancing_policy = DCAwareRoundRobinPolicy(local_dc='AWS_VPC_AP_SOUTHEAST_2')
+
+# Connect with an explicit settings object
+conn = CassandraConnectionManager(
+    settings=CassandraConnectionSettings(
+        cluster_ips=["1.1.1.1", "2.2.2.2"],
+        port=9042,
+        load_balancing_policy=load_balancing_policy,
+    ),
+    consistency_level=ConsistencyLevel.LOCAL_QUORUM
+)
+
+# OR source the username from a custom environment variable
+conn = CassandraConnectionManager(
+    CassandraConnectionSettings(
+        cluster_ips=["1.1.1.1", "2.2.2.2"],
+        port=9042,
+        load_balancing_policy=load_balancing_policy,
+        secrets_manager=CassandraSecretsManager(
+            username_var="MY_CUSTOM_USERNAME_ENV_VAR"),
+    ),
+    consistency_level=ConsistencyLevel.LOCAL_ONE
+)
+
+# For running Cassandra model operations
+conn.setup_connection("dev_space")
+
+class ExampleModel(Model):
+    example_type = columns.Integer(primary_key=True)
+    created_at = columns.DateTime()
+    description = columns.Text(required=False)
+
+sync_table(ExampleModel)
+```
+
+## Connect to Google Sheets
+
+#### How to connect
+You need to go to the Google developer console and get credentials. The Google sheet then needs to be shared with the client email, and GoogleApiConnectionSettings needs to be provided with the Google API credentials key JSON. You can then access the Google sheet using the workbook_url and the sheet name.
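+
+As a rough sketch of the wiring (the module paths and the keys_object argument below are assumptions for illustration, not confirmed API):
+```python
+import json
+
+# Assumed module paths for the classes named in this README
+from hip_data_tools.google.common import GoogleApiConnectionSettings
+from hip_data_tools.google.sheets import GoogleSheetConnectionManager
+
+# Load the service-account key JSON obtained from the Google developer console
+with open("service-account-key.json") as key_file:
+    key_json = json.load(key_file)
+
+# keys_object is an assumed parameter name for the credentials key JSON
+settings = GoogleApiConnectionSettings(keys_object=key_json)
+connection_manager = GoogleSheetConnectionManager(settings)
+```
+Remember to share the workbook with the client email from the key JSON before connecting.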
+
+#### How to instantiate Sheet Util
+You can instantiate Sheet Util by providing a GoogleSheetConnectionManager, the workbook_url, and the sheet name.
+```python
+sheet_util = SheetUtil(
+    conn_manager=GoogleSheetConnectionManager(self.settings.source_connection_settings),
+    workbook_url='https://docs.google.com/spreadsheets/d/cKyrzCBLfsQM/edit?usp=sharing',
+    sheet='Sheet1')
+```
+
+#### How to read a dataframe using SheetUtil
+You can get the data in the Google sheet as a Pandas DataFrame using the SheetUtil. We have defined a template for the Google sheet to use with this utility.
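+
+As an illustration, the sheet template implied by the row numbers in the call below would look roughly like this (the example values are made up):
+```
+row 7: number | text  | date        <- field types (field_types_row_number)
+row 8: id     | name  | created_at  <- field names (field_names_row_number)
+row 9: 1      | alice | 2023-01-01  <- data starts (data_start_row_number)
+```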
+
+You need to provide the "field_names_row_number" and "field_types_row_number" to call the "get_data_frame()" method in SheetUtil.
+
+```python
+sheet_data = sheet_util.get_data_frame(
+    field_names_row_number=8,
+    field_types_row_number=7,
+    row_range="12:20",
+    data_start_row_number=9)
+```
+
+You can use the load_sheet_to_athena() function to load Google sheet data into an Athena table.
+
+```python
+GoogleSheetToAthena(GoogleSheetsToAthenaSettings(
+    source_workbook_url='https://docs.google.com/spreadsheets/d/cKyrzCBLfsQM/edit?usp=sharing',
+    source_sheet='spec_example',
+    source_row_range=None,
+    source_fields=None,
+    source_field_names_row_number=5,
+    source_field_types_row_number=4,
+    source_data_start_row_number=6,
+    source_connection_settings=get_google_connection_settings(gcp_conn_id=GCP_CONN_ID),
+    manual_partition_key_value={"column": "start_date", "value": START_DATE},
+    target_database=athena_util.database,
+    target_table_name=TABLE_NAME,
+    target_s3_bucket=s3_util.bucket,
+    target_s3_dir=s3_dir,
+    target_connection_settings=get_aws_connection_settings(aws_conn_id=AWS_CONN_ID),
+    target_table_ddl_progress=False
+)).load_sheet_to_athena()
+```
+
+There is an integration test called "integration_test_should__load_sheet_to_athena__when_using_sheetUtil" that covers this functionality. You can run it by removing the "integration_" prefix from its name.
+
+%package help
+Summary: Development documents and examples for hip-data-tools
+Provides: python3-hip-data-tools-doc
+%description help
+# hip-data-tools
+© Hipages Group Pty Ltd 2019-2022
+
+[PyPI version](https://pypi.org/project/hip-data-tools/#history)
+[CircleCI build](https://circleci.com/gh/hipagesgroup/data-tools/tree/master)
+[Maintainability](https://codeclimate.com/repos/5d53b4c199b9430177008586/maintainability)
+[Test coverage](https://codeclimate.com/repos/5d53b4c199b9430177008586/test_coverage)
+
+Common Python tools and utilities for data engineering, ETL, exploration, etc.
+The package is uploaded to PyPI for easy drop-in use in various environments, such as (but not limited to):
+
+1. Running production workloads
+2. ML training in Jupyter-like notebooks
+3. Local machine for dev and exploration
+
+## Installation
+Install from the PyPI repo:
+```bash
+pip3 install hip-data-tools
+```
+
+Install from source:
+```bash
+pip3 install .
+```
+
+## MacOS Dependencies
+```
+brew install libev
+brew install librdkafka
+```
+
+## Connect to AWS
+
+You will need to instantiate an AWS connection:
+```python
+from hip_data_tools.aws.common import AwsConnectionManager, AwsConnectionSettings, AwsSecretsManager
+
+# To connect using an AWS CLI profile
+conn = AwsConnectionManager(AwsConnectionSettings(region="ap-southeast-2", secrets_manager=None, profile="default"))
+
+# OR, to connect using the standard AWS environment variables
+conn = AwsConnectionManager(settings=AwsConnectionSettings(region="ap-southeast-2", secrets_manager=AwsSecretsManager(), profile=None))
+
+# OR, to connect using a custom set of environment variables
+conn = AwsConnectionManager(
+    settings=AwsConnectionSettings(
+        region="ap-southeast-2",
+        secrets_manager=AwsSecretsManager(
+            access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
+            secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
+            use_session_token=True,
+            aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"
+        ),
+        profile=None,
+    )
+)
+```
+
+Using this connection object you can use the AWS utilities, for example AWS Athena:
+```python
+from hip_data_tools.aws.athena import AthenaUtil
+
+au = AthenaUtil(database="default", conn=conn, output_bucket="example", output_key="tmp/scratch/")
+result = au.run_query("SELECT * FROM temp limit 10", return_result=True)
+print(result)
+```
+
+## Connect to Cassandra
+
+```python
+from cassandra.policies import DCAwareRoundRobinPolicy
+from cassandra.cqlengine import columns
+from cassandra.cqlengine.management import sync_table
+from cassandra.cqlengine.models import Model
+from cassandra import ConsistencyLevel
+# Assumed import path for the connection helpers used below
+from hip_data_tools.apache.cassandra import CassandraConnectionManager, \
+    CassandraConnectionSettings, CassandraSecretsManager
+
+load_balancing_policy = DCAwareRoundRobinPolicy(local_dc='AWS_VPC_AP_SOUTHEAST_2')
+
+# Connect with an explicit settings object
+conn = CassandraConnectionManager(
+    settings=CassandraConnectionSettings(
+        cluster_ips=["1.1.1.1", "2.2.2.2"],
+        port=9042,
+        load_balancing_policy=load_balancing_policy,
+    ),
+    consistency_level=ConsistencyLevel.LOCAL_QUORUM
+)
+
+# OR source the username from a custom environment variable
+conn = CassandraConnectionManager(
+    CassandraConnectionSettings(
+        cluster_ips=["1.1.1.1", "2.2.2.2"],
+        port=9042,
+        load_balancing_policy=load_balancing_policy,
+        secrets_manager=CassandraSecretsManager(
+            username_var="MY_CUSTOM_USERNAME_ENV_VAR"),
+    ),
+    consistency_level=ConsistencyLevel.LOCAL_ONE
+)
+
+# For running Cassandra model operations
+conn.setup_connection("dev_space")
+
+class ExampleModel(Model):
+    example_type = columns.Integer(primary_key=True)
+    created_at = columns.DateTime()
+    description = columns.Text(required=False)
+
+sync_table(ExampleModel)
+```
+
+## Connect to Google Sheets
+
+#### How to connect
+You need to go to the Google developer console and get credentials. The Google sheet then needs to be shared with the client email, and GoogleApiConnectionSettings needs to be provided with the Google API credentials key JSON. You can then access the Google sheet using the workbook_url and the sheet name.
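+
+As a rough sketch of the wiring (the module paths and the keys_object argument below are assumptions for illustration, not confirmed API):
+```python
+import json
+
+# Assumed module paths for the classes named in this README
+from hip_data_tools.google.common import GoogleApiConnectionSettings
+from hip_data_tools.google.sheets import GoogleSheetConnectionManager
+
+# Load the service-account key JSON obtained from the Google developer console
+with open("service-account-key.json") as key_file:
+    key_json = json.load(key_file)
+
+# keys_object is an assumed parameter name for the credentials key JSON
+settings = GoogleApiConnectionSettings(keys_object=key_json)
+connection_manager = GoogleSheetConnectionManager(settings)
+```
+Remember to share the workbook with the client email from the key JSON before connecting.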
+
+#### How to instantiate Sheet Util
+You can instantiate Sheet Util by providing a GoogleSheetConnectionManager, the workbook_url, and the sheet name.
+```python
+sheet_util = SheetUtil(
+    conn_manager=GoogleSheetConnectionManager(self.settings.source_connection_settings),
+    workbook_url='https://docs.google.com/spreadsheets/d/cKyrzCBLfsQM/edit?usp=sharing',
+    sheet='Sheet1')
+```
+
+#### How to read a dataframe using SheetUtil
+You can get the data in the Google sheet as a Pandas DataFrame using the SheetUtil. We have defined a template for the Google sheet to use with this utility.
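+
+As an illustration, the sheet template implied by the row numbers in the call below would look roughly like this (the example values are made up):
+```
+row 7: number | text  | date        <- field types (field_types_row_number)
+row 8: id     | name  | created_at  <- field names (field_names_row_number)
+row 9: 1      | alice | 2023-01-01  <- data starts (data_start_row_number)
+```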
+
+You need to provide the "field_names_row_number" and "field_types_row_number" to call the "get_data_frame()" method in SheetUtil.
+
+```python
+sheet_data = sheet_util.get_data_frame(
+    field_names_row_number=8,
+    field_types_row_number=7,
+    row_range="12:20",
+    data_start_row_number=9)
+```
+
+You can use the load_sheet_to_athena() function to load Google sheet data into an Athena table.
+
+```python
+GoogleSheetToAthena(GoogleSheetsToAthenaSettings(
+    source_workbook_url='https://docs.google.com/spreadsheets/d/cKyrzCBLfsQM/edit?usp=sharing',
+    source_sheet='spec_example',
+    source_row_range=None,
+    source_fields=None,
+    source_field_names_row_number=5,
+    source_field_types_row_number=4,
+    source_data_start_row_number=6,
+    source_connection_settings=get_google_connection_settings(gcp_conn_id=GCP_CONN_ID),
+    manual_partition_key_value={"column": "start_date", "value": START_DATE},
+    target_database=athena_util.database,
+    target_table_name=TABLE_NAME,
+    target_s3_bucket=s3_util.bucket,
+    target_s3_dir=s3_dir,
+    target_connection_settings=get_aws_connection_settings(aws_conn_id=AWS_CONN_ID),
+    target_table_ddl_progress=False
+)).load_sheet_to_athena()
+```
+
+There is an integration test called "integration_test_should__load_sheet_to_athena__when_using_sheetUtil" that covers this functionality. You can run it by removing the "integration_" prefix from its name.
+
+%prep
+%autosetup -n hip-data-tools-1.67.2
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+    find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+    find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+    find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+    find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+    find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-hip-data-tools -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Wed May 10 2023 Python_Bot <Python_Bot@openeuler.org> - 1.67.2-1
+- Package Spec generated
