summaryrefslogtreecommitdiff
path: root/python-cdxj-indexer.spec
diff options
context:
space:
mode:
authorCoprDistGit <infra@openeuler.org>2023-05-05 06:08:42 +0000
committerCoprDistGit <infra@openeuler.org>2023-05-05 06:08:42 +0000
commitc61deaf10d2e52cb66c8935193de8cf2e0f29602 (patch)
treeff6d231fef8ad8a1a085260b30542c100189eda7 /python-cdxj-indexer.spec
parent0d40f16e41f71ac7bced7406074d0d2410d3026d (diff)
automatic import of python-cdxj-indexeropeneuler20.03
Diffstat (limited to 'python-cdxj-indexer.spec')
-rw-r--r--python-cdxj-indexer.spec220
1 files changed, 220 insertions, 0 deletions
diff --git a/python-cdxj-indexer.spec b/python-cdxj-indexer.spec
new file mode 100644
index 0000000..4b253a2
--- /dev/null
+++ b/python-cdxj-indexer.spec
@@ -0,0 +1,220 @@
+# Presumably keeps the build from aborting when a generated file manifest
+# (filelist.lst / doclist.lst) turns out to be empty -- confirm against the
+# distribution's rpm macros.
+%global _empty_manifest_terminate_build 0
+Name: python-cdxj-indexer
+Version: 1.4.5
+Release: 1
+Summary: CDXJ Indexer for WARC and ARC files
+# SPDX short identifier for the Apache License, Version 2.0.
+License: Apache-2.0
+URL: https://github.com/webrecorder/cdxj-indexer
+Source0: https://mirrors.nju.edu.cn/pypi/web/packages/5d/2e/c245b73d2897afc0f1eb369e30b56dae1cf8ec4762086e74b200448f1fa0/cdxj_indexer-1.4.5.tar.gz
+# No architecture-specific payload is declared for this package.
+BuildArch: noarch
+
+# Runtime dependencies.
+Requires: python3-warcio
+Requires: python3-surt
+Requires: python3-idna
+Requires: python3-py3amf
+
+%description
+CDXJ Indexer
+~~~~~~~~~~~~
+
+A command-line tool for generating CDXJ (and CDX) indexes from WARC and ARC files.
+The indexer is a new tool redesigned for fast and flexible indexing. (Based on the indexing functionality from `pywb <https://github.com/ikreymer/pywb>`_)
+
+Install with ``pip install cdxj-indexer`` or install locally with ``python setup.py install``
+
+
+The indexer supports classic CDX index format as well as the more flexible CDXJ. With CDXJ, the indexer supports custom fields and ``request`` record access for WARC files. See the examples below and the command line ``-h`` option for latest features. (This is a work in progress).
+
+
+Usage examples
+~~~~~~~~~~~~~~~~~~~~
+
+Generate CDXJ index:
+
+.. code:: console
+
+ > cdxj-indexer /path/to/archive-file.warc.gz
+ com,example)/ 20170730223850 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1219", "offset": "771", "filename": "example-20170730223917.warc.gz"}
+
+
+CDX Index (11 field):
+
+.. code:: console
+
+ > cdxj-indexer -11 /path/to/archive-file.warc.gz
+ CDX N b a m s k r M S V g
+ com,example)/ 20170730223850 http://example.com/ text/html 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - - 1219 771 example-20170730223917.warc.gz
+
+
+More advanced use cases: add additional http headers as fields. ``http:`` prefix specifies current record headers, while ``req.http:`` specifies corresponding request record headers. The following adds the Date, Referer headers, and the request method to the index:
+
+.. code:: console
+
+ > cdxj-indexer -f req.http:method,http:date,req.http:referer /path/to/archive-file.warc.gz
+ com,example)/ 20170801032435 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "A6DESOVDZ3WLYF57CS5E4RIC4ARPWRK7", "length": "1207", "offset": "834", "filename": "temp-20170801032445.warc.gz", "req.http:method": "GET", "http:date": "Tue, 01 Aug 2017 03:24:35 GMT", "referrer": "https://webrecorder.io/temp-NU34HBNO/temp/recording-session/record/http://example.com/"}
+ org,iana)/domains/example 20170801032437 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "RP3Y66FDBYBZKSFYQ4VJ4RMDA5BPDJX2", "length": "675", "offset": "2652", "filename": "temp-20170801032445.warc.gz", "req.http:method": "GET", "http:date": "Tue, 01 Aug 2017 02:35:05 GMT", "referrer": "http://example.com/"}
+
+
+The CDXJ Indexer extends the ``Indexer`` functionality in `warcio <https://github.com/webrecorder/warcio>`_ and should be flexible to extend.
+
+
+
+
+
+
+
+
+%package -n python3-cdxj-indexer
+# Python 3 subpackage; it also provides the unprefixed source package name.
+Summary: CDXJ Indexer for WARC and ARC files
+Provides: python-cdxj-indexer
+# Build-time tooling declared for this package.
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-cdxj-indexer
+CDXJ Indexer
+~~~~~~~~~~~~
+
+A command-line tool for generating CDXJ (and CDX) indexes from WARC and ARC files.
+The indexer is a new tool redesigned for fast and flexible indexing. (Based on the indexing functionality from `pywb <https://github.com/ikreymer/pywb>`_)
+
+Install with ``pip install cdxj-indexer`` or install locally with ``python setup.py install``
+
+
+The indexer supports classic CDX index format as well as the more flexible CDXJ. With CDXJ, the indexer supports custom fields and ``request`` record access for WARC files. See the examples below and the command line ``-h`` option for latest features. (This is a work in progress).
+
+
+Usage examples
+~~~~~~~~~~~~~~~~~~~~
+
+Generate CDXJ index:
+
+.. code:: console
+
+ > cdxj-indexer /path/to/archive-file.warc.gz
+ com,example)/ 20170730223850 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1219", "offset": "771", "filename": "example-20170730223917.warc.gz"}
+
+
+CDX Index (11 field):
+
+.. code:: console
+
+ > cdxj-indexer -11 /path/to/archive-file.warc.gz
+ CDX N b a m s k r M S V g
+ com,example)/ 20170730223850 http://example.com/ text/html 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - - 1219 771 example-20170730223917.warc.gz
+
+
+More advanced use cases: add additional http headers as fields. ``http:`` prefix specifies current record headers, while ``req.http:`` specifies corresponding request record headers. The following adds the Date, Referer headers, and the request method to the index:
+
+.. code:: console
+
+ > cdxj-indexer -f req.http:method,http:date,req.http:referer /path/to/archive-file.warc.gz
+ com,example)/ 20170801032435 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "A6DESOVDZ3WLYF57CS5E4RIC4ARPWRK7", "length": "1207", "offset": "834", "filename": "temp-20170801032445.warc.gz", "req.http:method": "GET", "http:date": "Tue, 01 Aug 2017 03:24:35 GMT", "referrer": "https://webrecorder.io/temp-NU34HBNO/temp/recording-session/record/http://example.com/"}
+ org,iana)/domains/example 20170801032437 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "RP3Y66FDBYBZKSFYQ4VJ4RMDA5BPDJX2", "length": "675", "offset": "2652", "filename": "temp-20170801032445.warc.gz", "req.http:method": "GET", "http:date": "Tue, 01 Aug 2017 02:35:05 GMT", "referrer": "http://example.com/"}
+
+
+The CDXJ Indexer extends the ``Indexer`` functionality in `warcio <https://github.com/webrecorder/warcio>`_ and should be flexible to extend.
+
+
+
+
+
+
+
+
+%package help
+# Documentation subpackage; also provides the python3-cdxj-indexer-doc name.
+Summary: Development documents and examples for cdxj-indexer
+Provides: python3-cdxj-indexer-doc
+%description help
+CDXJ Indexer
+~~~~~~~~~~~~
+
+A command-line tool for generating CDXJ (and CDX) indexes from WARC and ARC files.
+The indexer is a new tool redesigned for fast and flexible indexing. (Based on the indexing functionality from `pywb <https://github.com/ikreymer/pywb>`_)
+
+Install with ``pip install cdxj-indexer`` or install locally with ``python setup.py install``
+
+
+The indexer supports classic CDX index format as well as the more flexible CDXJ. With CDXJ, the indexer supports custom fields and ``request`` record access for WARC files. See the examples below and the command line ``-h`` option for latest features. (This is a work in progress).
+
+
+Usage examples
+~~~~~~~~~~~~~~~~~~~~
+
+Generate CDXJ index:
+
+.. code:: console
+
+ > cdxj-indexer /path/to/archive-file.warc.gz
+ com,example)/ 20170730223850 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1219", "offset": "771", "filename": "example-20170730223917.warc.gz"}
+
+
+CDX Index (11 field):
+
+.. code:: console
+
+ > cdxj-indexer -11 /path/to/archive-file.warc.gz
+ CDX N b a m s k r M S V g
+ com,example)/ 20170730223850 http://example.com/ text/html 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - - 1219 771 example-20170730223917.warc.gz
+
+
+More advanced use cases: add additional http headers as fields. ``http:`` prefix specifies current record headers, while ``req.http:`` specifies corresponding request record headers. The following adds the Date, Referer headers, and the request method to the index:
+
+.. code:: console
+
+ > cdxj-indexer -f req.http:method,http:date,req.http:referer /path/to/archive-file.warc.gz
+ com,example)/ 20170801032435 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "A6DESOVDZ3WLYF57CS5E4RIC4ARPWRK7", "length": "1207", "offset": "834", "filename": "temp-20170801032445.warc.gz", "req.http:method": "GET", "http:date": "Tue, 01 Aug 2017 03:24:35 GMT", "referrer": "https://webrecorder.io/temp-NU34HBNO/temp/recording-session/record/http://example.com/"}
+ org,iana)/domains/example 20170801032437 {"url": "http://www.iana.org/domains/example", "mime": "text/html", "status": "302", "digest": "RP3Y66FDBYBZKSFYQ4VJ4RMDA5BPDJX2", "length": "675", "offset": "2652", "filename": "temp-20170801032445.warc.gz", "req.http:method": "GET", "http:date": "Tue, 01 Aug 2017 02:35:05 GMT", "referrer": "http://example.com/"}
+
+
+The CDXJ Indexer extends the ``Indexer`` functionality in `warcio <https://github.com/webrecorder/warcio>`_ and should be flexible to extend.
+
+
+
+
+
+
+
+
+%prep
+# Unpack Source0 and change into the named top-level directory.
+# NOTE(review): Source0 is named cdxj_indexer-1.4.5 (underscore) while the
+# directory given to -n uses a hyphen -- confirm the directory name inside the
+# tarball matches.
+%autosetup -n cdxj-indexer-1.4.5
+
+%build
+# Build step is delegated to the py3_build macro.
+%py3_build
+
+%install
+%py3_install
+# Copy whichever documentation/example directories exist in the source tree
+# into the package documentation directory.
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+for docdir in doc docs example examples; do
+ if [ -d "$docdir" ]; then cp -arf "$docdir" %{buildroot}/%{_pkgdocdir}; fi
+done
+# Generate the manifests used by the files sections. Paths are collected
+# relative to the buildroot and written with a leading slash.
+pushd %{buildroot}
+for tree in usr/lib usr/lib64 usr/bin usr/sbin; do
+ [ -d "$tree" ] || continue
+ find "$tree" -type f -printf "/%h/%f\n" >> filelist.lst
+done
+# doclist.lst must exist even when there are no man pages; entries are listed
+# with a .gz suffix appended to each man page path.
+touch doclist.lst
+if [ -d usr/share/man ]; then
+ find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+# Move both manifests out of the buildroot into the build directory.
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-cdxj-indexer -f filelist.lst
+# Regular files come from filelist.lst (generated during install); the entry
+# below additionally lists the top-level entries under the Python 3
+# site-packages directory as directories.
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+# Man pages come from doclist.lst; everything under the doc directory is
+# included as well.
+%{_docdir}/*
+
+%changelog
+* Fri May 05 2023 Python_Bot <Python_Bot@openeuler.org> - 1.4.5-1
+- Package Spec generated