From 0910cdea6aadee1d23c2cbca9bdc268a0361f931 Mon Sep 17 00:00:00 2001
From: CoprDistGit
Date: Mon, 15 May 2023 07:24:53 +0000
Subject: automatic import of python-spark-etl
---
 .gitignore            |   1 +
 python-spark-etl.spec | 516 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sources               |   1 +
 3 files changed, 518 insertions(+)
 create mode 100644 python-spark-etl.spec
 create mode 100644 sources

diff --git a/.gitignore b/.gitignore
index e69de29..2828730 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/spark-etl-0.0.122.tar.gz
diff --git a/python-spark-etl.spec b/python-spark-etl.spec
new file mode 100644
index 0000000..72b3bcf
--- /dev/null
+++ b/python-spark-etl.spec
@@ -0,0 +1,516 @@
+%global _empty_manifest_terminate_build 0
+Name:           python-spark-etl
+Version:        0.0.122
+Release:        1
+Summary:        Generic ETL Pipeline Framework for Apache Spark
+License:        MIT
+URL:            https://github.com/stonezhong/spark_etl
+Source0:        https://mirrors.nju.edu.cn/pypi/web/packages/9e/b4/940f4b3aea2b51b6358bfa552ea03c04001978cbd16694f666a129e5f97a/spark-etl-0.0.122.tar.gz
+BuildArch:      noarch
+
+Requires:       python3-requests
+Requires:       python3-Jinja2
+Requires:       python3-termcolor
+
+%description
+* [Overview](#overview)
+  * [Goal](#goal)
+  * [Benefit](#benefit)
+  * [Application](#application)
+  * [Build your application](#build_your_application)
+  * [Deploy your application](#deploy_your_application)
+  * [Run your application](#run_your_application)
+  * [Supported platforms](#supported_platforms)
+* [Demos](#demos)
+* [APIs](#apis)
+  * [Job Deployer](#job-deployer)
+  * [Job Submitter](#job-submitter)
+
+# Overview
+
+## Goal
+Many public clouds offer managed Apache Spark as a service, for example Databricks, AWS EMR and Oracle OCI DataFlow; see the table below for a detailed list.
+
+However, the way you deploy and launch a Spark application differs from one cloud Spark platform to another.
+
+spark-etl is a Python package that provides a standard way to build, deploy and run your Spark application across the supported cloud Spark platforms.
+
+## Benefit
+An application built with `spark-etl` can be deployed and launched on different cloud Spark platforms without changing its source code.
+
+## Application
+An application is a Python program. It contains:
+* A `main.py` file, which contains the application entry point
+* A `manifest.json` file, which specifies the application's metadata
+* A `requirements.txt` file, which specifies the application's dependencies
+
+### Application entry signature
+In your application's `main.py`, you should have a `main` function with the following signature:
+* `spark` is the Spark session object.
+* `input_args`, a dict, holds the arguments the user specified when running the application.
+* `sysops` holds platform-specific system options; the job submitter may inject platform-specific objects into `sysops`.
+* Your `main` function should return a JSON object; the job submitter returns it to the caller.
+```
+def main(spark, input_args, sysops={}):
+    # your code here
+```
+[Here](examples/apps/demo01) is an application example; a minimal entry function is also sketched below, after the platform table.
+
+
+## Build your application
+`etl -a build -c -p `
+## Deploy your application
+`etl -a deploy -c -p -f `
+## Run your application
+`etl -a run -c -p -f --run-args `
+## Supported platforms
+| Platform | Notes |
+| -------- | ----- |
+| Apache Spark (your own cluster) | You set up your own Apache Spark cluster. |
+| PySpark (local) | Uses the PySpark package; fully compatible with the other Spark platforms, so you can test your pipeline on a single computer. |
+| Databricks | You host your Spark cluster in Databricks. |
+| Amazon AWS EMR | You host your Spark cluster in Amazon AWS EMR. |
+| Google Cloud | You host your Spark cluster in Google Cloud. |
+| Microsoft Azure HDInsight | You host your Spark cluster in Microsoft Azure HDInsight. |
+| Oracle Cloud Infrastructure Data Flow | You host your Spark cluster in the Oracle Cloud Infrastructure Data Flow service. |
+| IBM Cloud | You host your Spark cluster in IBM Cloud. |
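+
+## Application entry example
+As an illustration only, the sketch below shows one way to write an entry function that follows the signature described above. Only the `main(spark, input_args, sysops)` signature comes from spark-etl; the `num_rows` argument and the returned keys are assumed names made up for this example.
+```
+def main(spark, input_args, sysops={}):
+    # "num_rows" is an assumed, application-defined input argument;
+    # spark-etl only promises that input_args is the dict the caller supplied.
+    num_rows = int(input_args.get("num_rows", 10))
+
+    # Use the SparkSession handed in by the platform for some trivial work.
+    df = spark.range(num_rows)
+    total = df.count()
+
+    # Return a JSON-serializable object; the job submitter relays it to the caller.
+    return {"num_rows": num_rows, "row_count": total}
+```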
+
+# Demos
+* [Using local pyspark, access data on local disk](examples/pyspark_local/readme.md)
+* [Using local pyspark, access data on AWS S3](examples/pyspark_s3/readme.md)
+* [Using on-premise spark, access data on HDFS](examples/livy_hdfs1/readme.md)
+* [Using on-premise spark, access data on AWS S3](examples/livy_hdfs2/readme.md)
+* [Using AWS EMR's spark, access data on AWS S3](examples/aws_emr/readme.md)
+* [Using Oracle OCI's Dataflow with API key, access data on Object Storage](examples/oci_dataflow1/readme.md)
+* [Using Oracle OCI's Dataflow with instance principal, access data on Object Storage](examples/oci_dataflow2/readme.md)
+
+# APIs
+[pydocs for APIs](https://stonezhong.github.io/spark_etl/pydocs/spark_etl.html)
+
+
+## Job Deployer
+For job deployers, please check the [wiki](https://github.com/stonezhong/spark_etl/wiki#job-deployer-classes).
+
+
+## Job Submitter
+For job submitters, please check the [wiki](https://github.com/stonezhong/spark_etl/wiki#job-submitter-classes).
+
+
+
+
+%package -n python3-spark-etl
+Summary:        Generic ETL Pipeline Framework for Apache Spark
+Provides:       python-spark-etl
+BuildRequires:  python3-devel
+BuildRequires:  python3-setuptools
+BuildRequires:  python3-pip
+%description -n python3-spark-etl
+* [Overview](#overview)
+  * [Goal](#goal)
+  * [Benefit](#benefit)
+  * [Application](#application)
+  * [Build your application](#build_your_application)
+  * [Deploy your application](#deploy_your_application)
+  * [Run your application](#run_your_application)
+  * [Supported platforms](#supported_platforms)
+* [Demos](#demos)
+* [APIs](#apis)
+  * [Job Deployer](#job-deployer)
+  * [Job Submitter](#job-submitter)
+
+# Overview
+
+## Goal
+Many public clouds offer managed Apache Spark as a service, for example Databricks, AWS EMR and Oracle OCI DataFlow; see the table below for a detailed list.
+
+However, the way you deploy and launch a Spark application differs from one cloud Spark platform to another.
+
+spark-etl is a Python package that provides a standard way to build, deploy and run your Spark application across the supported cloud Spark platforms.
+
+## Benefit
+An application built with `spark-etl` can be deployed and launched on different cloud Spark platforms without changing its source code.
+
+## Application
+An application is a Python program. It contains:
+* A `main.py` file, which contains the application entry point
+* A `manifest.json` file, which specifies the application's metadata
+* A `requirements.txt` file, which specifies the application's dependencies
+
+### Application entry signature
+In your application's `main.py`, you should have a `main` function with the following signature:
+* `spark` is the Spark session object.
+* `input_args`, a dict, holds the arguments the user specified when running the application.
+* `sysops` holds platform-specific system options; the job submitter may inject platform-specific objects into `sysops`.
+* Your `main` function should return a JSON object; the job submitter returns it to the caller.
+```
+def main(spark, input_args, sysops={}):
+    # your code here
+```
+[Here](examples/apps/demo01) is an application example.
+
+
+## Build your application
+`etl -a build -c -p `
+## Deploy your application
+`etl -a deploy -c -p -f `
+## Run your application
+`etl -a run -c -p -f --run-args `
+## Supported platforms
+| Platform | Notes |
+| -------- | ----- |
+| Apache Spark (your own cluster) | You set up your own Apache Spark cluster. |
+| PySpark (local) | Uses the PySpark package; fully compatible with the other Spark platforms, so you can test your pipeline on a single computer. |
+| Databricks | You host your Spark cluster in Databricks. |
+| Amazon AWS EMR | You host your Spark cluster in Amazon AWS EMR. |
+| Google Cloud | You host your Spark cluster in Google Cloud. |
+| Microsoft Azure HDInsight | You host your Spark cluster in Microsoft Azure HDInsight. |
+| Oracle Cloud Infrastructure Data Flow | You host your Spark cluster in the Oracle Cloud Infrastructure Data Flow service. |
+| IBM Cloud | You host your Spark cluster in IBM Cloud. |
+
+# Demos
+* [Using local pyspark, access data on local disk](examples/pyspark_local/readme.md)
+* [Using local pyspark, access data on AWS S3](examples/pyspark_s3/readme.md)
+* [Using on-premise spark, access data on HDFS](examples/livy_hdfs1/readme.md)
+* [Using on-premise spark, access data on AWS S3](examples/livy_hdfs2/readme.md)
+* [Using AWS EMR's spark, access data on AWS S3](examples/aws_emr/readme.md)
+* [Using Oracle OCI's Dataflow with API key, access data on Object Storage](examples/oci_dataflow1/readme.md)
+* [Using Oracle OCI's Dataflow with instance principal, access data on Object Storage](examples/oci_dataflow2/readme.md)
+
+# APIs
+[pydocs for APIs](https://stonezhong.github.io/spark_etl/pydocs/spark_etl.html)
+
+
+## Job Deployer
+For job deployers, please check the [wiki](https://github.com/stonezhong/spark_etl/wiki#job-deployer-classes).
+
+
+## Job Submitter
+For job submitters, please check the [wiki](https://github.com/stonezhong/spark_etl/wiki#job-submitter-classes).
+
+
+
+
+%package help
+Summary:        Development documents and examples for spark-etl
+Provides:       python3-spark-etl-doc
+%description help
+* [Overview](#overview)
+  * [Goal](#goal)
+  * [Benefit](#benefit)
+  * [Application](#application)
+  * [Build your application](#build_your_application)
+  * [Deploy your application](#deploy_your_application)
+  * [Run your application](#run_your_application)
+  * [Supported platforms](#supported_platforms)
+* [Demos](#demos)
+* [APIs](#apis)
+  * [Job Deployer](#job-deployer)
+  * [Job Submitter](#job-submitter)
+
+# Overview
+
+## Goal
+Many public clouds offer managed Apache Spark as a service, for example Databricks, AWS EMR and Oracle OCI DataFlow; see the table below for a detailed list.
+
+However, the way you deploy and launch a Spark application differs from one cloud Spark platform to another.
+
+spark-etl is a Python package that provides a standard way to build, deploy and run your Spark application across the supported cloud Spark platforms.
+
+## Benefit
+An application built with `spark-etl` can be deployed and launched on different cloud Spark platforms without changing its source code.
+
+## Application
+An application is a Python program. It contains:
+* A `main.py` file, which contains the application entry point
+* A `manifest.json` file, which specifies the application's metadata
+* A `requirements.txt` file, which specifies the application's dependencies
+
+### Application entry signature
+In your application's `main.py`, you should have a `main` function with the following signature:
+* `spark` is the Spark session object.
+* `input_args`, a dict, holds the arguments the user specified when running the application.
+* `sysops` holds platform-specific system options; the job submitter may inject platform-specific objects into `sysops`.
+* Your `main` function should return a JSON object; the job submitter returns it to the caller.
+```
+def main(spark, input_args, sysops={}):
+    # your code here
+```
+[Here](examples/apps/demo01) is an application example.
+
+
+## Build your application
+`etl -a build -c -p `
+## Deploy your application
+`etl -a deploy -c -p -f `
+## Run your application
+`etl -a run -c -p -f --run-args `
+## Supported platforms
+| Platform | Notes |
+| -------- | ----- |
+| Apache Spark (your own cluster) | You set up your own Apache Spark cluster. |
+| PySpark (local) | Uses the PySpark package; fully compatible with the other Spark platforms, so you can test your pipeline on a single computer. |
+| Databricks | You host your Spark cluster in Databricks. |
+| Amazon AWS EMR | You host your Spark cluster in Amazon AWS EMR. |
+| Google Cloud | You host your Spark cluster in Google Cloud. |
+| Microsoft Azure HDInsight | You host your Spark cluster in Microsoft Azure HDInsight. |
+| Oracle Cloud Infrastructure Data Flow | You host your Spark cluster in the Oracle Cloud Infrastructure Data Flow service. |
+| IBM Cloud | You host your Spark cluster in IBM Cloud. |
+
+# Demos
+* [Using local pyspark, access data on local disk](examples/pyspark_local/readme.md)
+* [Using local pyspark, access data on AWS S3](examples/pyspark_s3/readme.md)
+* [Using on-premise spark, access data on HDFS](examples/livy_hdfs1/readme.md)
+* [Using on-premise spark, access data on AWS S3](examples/livy_hdfs2/readme.md)
+* [Using AWS EMR's spark, access data on AWS S3](examples/aws_emr/readme.md)
+* [Using Oracle OCI's Dataflow with API key, access data on Object Storage](examples/oci_dataflow1/readme.md)
+* [Using Oracle OCI's Dataflow with instance principal, access data on Object Storage](examples/oci_dataflow2/readme.md)
+
+# APIs
+[pydocs for APIs](https://stonezhong.github.io/spark_etl/pydocs/spark_etl.html)
+
+
+## Job Deployer
+For job deployers, please check the [wiki](https://github.com/stonezhong/spark_etl/wiki#job-deployer-classes).
+
+
+## Job Submitter
+For job submitters, please check the [wiki](https://github.com/stonezhong/spark_etl/wiki#job-submitter-classes).
+
+
+
+
+%prep
+%autosetup -n spark-etl-0.0.122
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+	find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+	find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+	find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+	find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+	find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-spark-etl -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Mon May 15 2023 Python_Bot - 0.0.122-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..9edc623
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+2dbeec06d146eb2410e124ba20936222 spark-etl-0.0.122.tar.gz
-- 
cgit v1.2.3