.gitignore:

/S3WebCache-0.2.2.tar.gz

python-s3webcache.spec:

%global _empty_manifest_terminate_build 0
Name:           python-S3WebCache
Version:        0.2.2
Release:        1
Summary:        Archive web pages (HTML) to S3 and serve them back as a cache
License:        MIT
URL:            https://pypi.org/project/S3WebCache/
Source0:        https://mirrors.aliyun.com/pypi/web/packages/a5/46/ea7dfad1bf73ab6a179b387da3832a086d488e730f4194b20b041b95e6a7/S3WebCache-0.2.2.tar.gz
BuildArch:      noarch


%description
[Build Status](https://travis-ci.org/wharton/S3WebCache)
[PyPI version](https://badge.fury.io/py/S3WebCache)
[License: MIT](https://opensource.org/licenses/MIT)

# S3 Web Cache

This is a simple package for archiving web pages (HTML) to S3. It acts as a cache, returning the S3 version of a page if it exists; if not, it fetches the URL through [Requests](http://docs.python-requests.org/en/master/) and archives it in S3.

Our use case is to provide a reusable history of the pages included in a web scrape: an archived version of a particular URL at a moment in time. Since the web is always changing, different research questions can be asked at a later date without losing the original content. Please only use it in this manner if you have obtained permission for the pages you are requesting.


## Quickstart


### Install

`pip install s3webcache`


### Usage

```
from s3webcache import S3WebCache

s3wc = S3WebCache(
    bucket_name=<BUCKET>,
    aws_access_key_id=<AWS_ACCESS_KEY_ID>,
    aws_secret_key=<AWS_SECRET_ACCESS_KEY>,
    aws_default_region=<AWS_DEFAULT_REGION>)

request = s3wc.get("https://en.wikipedia.org/wiki/Whole_Earth_Catalog")

if request.success:
    html = request.message
```

If the required AWS credentials are not given, it falls back to using environment variables.

The `.get(url)` operation returns a namedtuple, `Request(success: bool, message: str)`.

For successful operations, `.message` contains the URL's data.
For unsuccessful operations, `.message` contains error information.
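A minimal sketch of that flow, assuming the fallback reads the standard AWS environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_DEFAULT_REGION`); the bucket name here is hypothetical:

```
from s3webcache import S3WebCache

# No credential arguments: assumes AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
# and AWS_DEFAULT_REGION are already set in the environment.
s3wc = S3WebCache(bucket_name="my-archive-bucket")  # hypothetical bucket

request = s3wc.get("https://en.wikipedia.org/wiki/Whole_Earth_Catalog")

if request.success:
    html = request.message  # page HTML, served from S3 if cached, else fetched live
else:
    print(f"Fetch failed: {request.message}")  # error information
```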

### Options

S3WebCache() takes the following arguments with these defaults:

 - bucket_name: str
 - path_prefix: str = None\
   Subdirectories under which to store URLs. `path_prefix='ht'` will start archiving at the path s3://BUCKETNAME/ht/
 - aws_access_key_id: str = None
 - aws_secret_key: str = None
 - aws_default_region: str = None
 - trim_website: bool = False\
   Trims out the hostname. By default the hostname is stored with its dots replaced by underscores: `https://github.com/wharton/S3WebCache` would be stored as `s3://BUCKETNAME/github_com/wharton/S3WebCache`.\
   Set this to True and it will be stored as `s3://BUCKETNAME/wharton/S3WebCache` (see the sketch after this list).
 - allow_forwarding: bool = True\
   Will follow HTTP 3xx redirects.
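To make the key layout concrete, here is an illustrative mapping under the documented defaults. This is a sketch of the documented behavior, not the library's own implementation, and the helper name is invented:

```
from urllib.parse import urlparse

def illustrate_key(url, bucket, path_prefix=None, trim_website=False):
    """Sketch of the documented S3 key layout (hypothetical helper)."""
    parsed = urlparse(url)
    parts = []
    if path_prefix:
        parts.append(path_prefix)  # e.g. 'ht' -> s3://BUCKETNAME/ht/...
    if not trim_website:
        # hostname kept, with dots replaced by underscores
        parts.append(parsed.netloc.replace(".", "_"))
    parts.append(parsed.path.lstrip("/"))
    return f"s3://{bucket}/" + "/".join(parts)

# illustrate_key("https://github.com/wharton/S3WebCache", "BUCKETNAME")
#   -> 's3://BUCKETNAME/github_com/wharton/S3WebCache'
# illustrate_key("https://github.com/wharton/S3WebCache", "BUCKETNAME", trim_website=True)
#   -> 's3://BUCKETNAME/wharton/S3WebCache'
```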

## TODO

 - Add "update S3 if the file is older than..." behavior
 - Add transparent compression by default (gzip, lz4, etc.)
 - Add rate limiting


## Reference

[AWS S3 API documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html)


## License

MIT


## Tests

Run through Travis CI.

%package -n python3-S3WebCache
Summary:        Archive web pages (HTML) to S3 and serve them back as a cache
Provides:       python-S3WebCache
BuildRequires:  python3-devel
BuildRequires:  python3-setuptools
BuildRequires:  python3-pip
%description -n python3-S3WebCache
[Build Status](https://travis-ci.org/wharton/S3WebCache)
[PyPI version](https://badge.fury.io/py/S3WebCache)
[License: MIT](https://opensource.org/licenses/MIT)

# S3 Web Cache

This is a simple package for archiving web pages (HTML) to S3. It acts as a cache, returning the S3 version of a page if it exists; if not, it fetches the URL through [Requests](http://docs.python-requests.org/en/master/) and archives it in S3.

Our use case is to provide a reusable history of the pages included in a web scrape: an archived version of a particular URL at a moment in time. Since the web is always changing, different research questions can be asked at a later date without losing the original content. Please only use it in this manner if you have obtained permission for the pages you are requesting.


## Quickstart


### Install

`pip install s3webcache`


### Usage

```
from s3webcache import S3WebCache

s3wc = S3WebCache(
    bucket_name=<BUCKET>,
    aws_access_key_id=<AWS_ACCESS_KEY_ID>,
    aws_secret_key=<AWS_SECRET_ACCESS_KEY>,
    aws_default_region=<AWS_DEFAULT_REGION>)

request = s3wc.get("https://en.wikipedia.org/wiki/Whole_Earth_Catalog")

if request.success:
    html = request.message
```

If the required AWS credentials are not given, it falls back to using environment variables.

The `.get(url)` operation returns a namedtuple, `Request(success: bool, message: str)`.

For successful operations, `.message` contains the URL's data.
For unsuccessful operations, `.message` contains error information.


### Options

S3WebCache() takes the following arguments with these defaults:

 - bucket_name: str
 - path_prefix: str = None\
   Subdirectories under which to store URLs. `path_prefix='ht'` will start archiving at the path s3://BUCKETNAME/ht/
 - aws_access_key_id: str = None
 - aws_secret_key: str = None
 - aws_default_region: str = None
 - trim_website: bool = False\
   Trims out the hostname. By default the hostname is stored with its dots replaced by underscores: `https://github.com/wharton/S3WebCache` would be stored as `s3://BUCKETNAME/github_com/wharton/S3WebCache`.\
   Set this to True and it will be stored as `s3://BUCKETNAME/wharton/S3WebCache`.
 - allow_forwarding: bool = True\
   Will follow HTTP 3xx redirects.


## TODO

 - Add "update S3 if the file is older than..." behavior
 - Add transparent compression by default (gzip, lz4, etc.)
 - Add rate limiting


## Reference

[AWS S3 API documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html)


## License

MIT


## Tests

Run through Travis CI.

%package help
Summary:        Development documents and examples for S3WebCache
Provides:       python3-S3WebCache-doc
%description help
[Build Status](https://travis-ci.org/wharton/S3WebCache)
[PyPI version](https://badge.fury.io/py/S3WebCache)
[License: MIT](https://opensource.org/licenses/MIT)

# S3 Web Cache

This is a simple package for archiving web pages (HTML) to S3. It acts as a cache, returning the S3 version of a page if it exists; if not, it fetches the URL through [Requests](http://docs.python-requests.org/en/master/) and archives it in S3.

Our use case is to provide a reusable history of the pages included in a web scrape: an archived version of a particular URL at a moment in time. Since the web is always changing, different research questions can be asked at a later date without losing the original content. Please only use it in this manner if you have obtained permission for the pages you are requesting.


## Quickstart


### Install

`pip install s3webcache`


### Usage

```
from s3webcache import S3WebCache

s3wc = S3WebCache(
    bucket_name=<BUCKET>,
    aws_access_key_id=<AWS_ACCESS_KEY_ID>,
    aws_secret_key=<AWS_SECRET_ACCESS_KEY>,
    aws_default_region=<AWS_DEFAULT_REGION>)

request = s3wc.get("https://en.wikipedia.org/wiki/Whole_Earth_Catalog")

if request.success:
    html = request.message
```

If the required AWS credentials are not given, it falls back to using environment variables.

The `.get(url)` operation returns a namedtuple, `Request(success: bool, message: str)`.

For successful operations, `.message` contains the URL's data.
For unsuccessful operations, `.message` contains error information.


### Options

S3WebCache() takes the following arguments with these defaults:

 - bucket_name: str
 - path_prefix: str = None\
   Subdirectories under which to store URLs. `path_prefix='ht'` will start archiving at the path s3://BUCKETNAME/ht/
 - aws_access_key_id: str = None
 - aws_secret_key: str = None
 - aws_default_region: str = None
 - trim_website: bool = False\
   Trims out the hostname. By default the hostname is stored with its dots replaced by underscores: `https://github.com/wharton/S3WebCache` would be stored as `s3://BUCKETNAME/github_com/wharton/S3WebCache`.\
   Set this to True and it will be stored as `s3://BUCKETNAME/wharton/S3WebCache`.
 - allow_forwarding: bool = True\
   Will follow HTTP 3xx redirects.


## TODO

 - Add "update S3 if the file is older than..." behavior
 - Add transparent compression by default (gzip, lz4, etc.)
 - Add rate limiting


## Reference

[AWS S3 API documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html)


## License

MIT


## Tests

Run through Travis CI.

%prep
%autosetup -n S3WebCache-0.2.2

%build
%py3_build

%install
%py3_install
install -d -m755 %{buildroot}/%{_pkgdocdir}
if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
pushd %{buildroot}
if [ -d usr/lib ]; then
    find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/lib64 ]; then
    find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/bin ]; then
    find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/sbin ]; then
    find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
touch doclist.lst
if [ -d usr/share/man ]; then
    find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
fi
popd
mv %{buildroot}/filelist.lst .
mv %{buildroot}/doclist.lst .
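# How the manifests above are built: running from the buildroot, find prints
# each installed file as a quoted absolute path (in find's -printf, %%h is
# the file's directory and %%f its basename), for example (illustrative
# path only):
#   "/usr/lib/python3.9/site-packages/s3webcache/__init__.py"
# The resulting filelist.lst and doclist.lst are consumed by the %%files
# sections below via -f.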
%files -n python3-S3WebCache -f filelist.lst
%dir %{python3_sitelib}/*

%files help -f doclist.lst
%{_docdir}/*

%changelog
* Tue Jun 20 2023 Python_Bot <Python_Bot@openeuler.org> - 0.2.2-1
- Package Spec generated

sources:

2fe89a525b9dd38ea92f8528367bafed S3WebCache-0.2.2.tar.gz
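The `sources` entry is the MD5 checksum of the Source0 tarball. Assuming GNU coreutils, it can be verified against a downloaded copy with, e.g., `echo "2fe89a525b9dd38ea92f8528367bafed  S3WebCache-0.2.2.tar.gz" | md5sum -c -` (note the two spaces required by the checksum-line format).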