From e069a496577dbc58edab79ad3d0d75a8b2905092 Mon Sep 17 00:00:00 2001 From: CoprDistGit Date: Fri, 5 May 2023 13:08:41 +0000 Subject: automatic import of python-chompjs --- .gitignore | 1 + python-chompjs.spec | 696 ++++++++++++++++++++++++++++++++++++++++++++++++++++ sources | 1 + 3 files changed, 698 insertions(+) create mode 100644 python-chompjs.spec create mode 100644 sources diff --git a/.gitignore b/.gitignore index e69de29..0c55c71 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +/chompjs-1.1.9.tar.gz diff --git a/python-chompjs.spec b/python-chompjs.spec new file mode 100644 index 0000000..3417d8c --- /dev/null +++ b/python-chompjs.spec @@ -0,0 +1,696 @@ +%global _empty_manifest_terminate_build 0 +Name: python-chompjs +Version: 1.1.9 +Release: 1 +Summary: Parsing JavaScript objects into Python dictionaries +License: MIT License +URL: https://github.com/Nykakin/chompjs +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/24/72/22660aba976ba8e31aee62be5e69666df895286c535635660d4b925029fc/chompjs-1.1.9.tar.gz +BuildArch: noarch + + +%description +# Usage + +`chompjs` can be used in web scrapping for turning JavaScript objects embedded in pages into valid Python dictionaries. + +```python +>>> import chompjs +>>> chompjs.parse_js_object('{"my_data": "test"}') +{u'my_data': u'test'} +``` + +Think of it as a more powerful `json.loads`. For example, it can handle JSON objects containing embedded methods by storing their code in a string: + +```python +>>> import chompjs +>>> js = """ +... var myObj = { +... myMethod: function(params) { +... // ... +... }, +... myValue: 100 +... } +... """ +>>> chompjs.parse_js_object(js, json_params={'strict': False}) +{'myMethod': 'function(params) {\n // ...\n }', 'myValue': 100} +``` + +An example usage with `scrapy`: + +```python +import chompjs +import scrapy + + +class MySpider(scrapy.Spider): + # ... 
+ + def parse(self, response): + script_css = 'script:contains("__NEXT_DATA__")::text' + script_pattern = r'__NEXT_DATA__ = (.*);' + # warning: for some pages you need to pass replace_entities=True + # into re_first to have JSON escaped properly + script_text = response.css(script_css).re_first(script_pattern) + try: + json_data = chompjs.parse_js_object(script_text) + except ValueError: + self.log('Failed to extract data from {}'.format(response.url)) + return + + # work on json_data +``` + +If the input string is not yet escaped and contains a lot of `\\` characters, then the `unicode_escape=True` argument might help to sanitize it: + +```python +>>> chompjs.parse_js_object('{\\\"a\\\": 12}', unicode_escape=True) +{u'a': 12} +``` + +`jsonlines=True` can be used to parse JSON Lines: + +```python +>>> chompjs.parse_js_object('[1,2]\n[2,3]\n[3,4]', jsonlines=True) +[[1, 2], [2, 3], [3, 4]] +``` + +By default `chompjs` tries to start with the first `{` or `[` character it finds, omitting the rest: + +```python +>>> chompjs.parse_js_object('
...
...
') +[1, 2, 3] +``` + +The `json_params` argument can be used to pass options to the underlying `json.loads`, such as `strict` or `object_hook`: + +```python +>>> import decimal +>>> import chompjs +>>> chompjs.parse_js_object('[23.2]', json_params={'parse_float': decimal.Decimal}) +[Decimal('23.2')] +``` + +# Rationale + +In web scraping, data is often not present directly inside HTML, but is instead provided as an embedded JavaScript object that is later used to initialize the page, for example: + +```html + +... + +... + +... + + +``` + +The standard library function `json.loads` is usually sufficient to extract this data: + +```python +>>> # scrapy shell file:///tmp/test.html +>>> import json +>>> script_text = response.css('script:contains(__PRELOADED_STATE__)::text').re_first('__PRELOADED_STATE__=(.*)') +>>> json.loads(script_text) +{u'foo': u'bar'} + +``` +The problem is that not all valid JavaScript objects are also valid JSON. For example, all of the following strings are valid JavaScript objects but not valid JSON: + +* `"{'a': 'b'}"` is not valid JSON because it uses the `'` character to quote +* `'{a: "b"}'` is not valid JSON because the property name is not quoted at all +* `'{"a": [1, 2, 3,]}'` is not valid JSON because there is an extra `,` character at the end of the array +* `'{"a": .99}'` is not valid JSON because the float value lacks a leading 0 + +As a result, `json.loads` fails to extract any of those: + +``` +>>> json.loads("{'a': 'b'}") +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 380, in raw_decode + obj, end = self.scan_once(s, idx) +ValueError: Expecting property name: line 1 column 2 (char 1) +>>> json.loads('{a: "b"}') +Traceback (most recent call last): + File "", line 1, in + File
"/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 380, in raw_decode + obj, end = self.scan_once(s, idx) +ValueError: Expecting property name: line 1 column 2 (char 1) +>>> json.loads('{"a": [1, 2, 3,]}') +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 382, in raw_decode + raise ValueError("No JSON object could be decoded") +ValueError: No JSON object could be decoded +>>> json.loads('{"a": .99}') +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python3.7/json/__init__.py", line 348, in loads + return _default_decoder.decode(s) + File "/usr/lib/python3.7/json/decoder.py", line 337, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python3.7/json/decoder.py", line 355, in raw_decode + raise JSONDecodeError("Expecting value", s, err.value) from None +json.decoder.JSONDecodeError: Expecting value: line 1 column 7 (char 6) + +``` +`chompjs` library was designed to bypass this limitation, and it allows to scrape such JavaScript objects into proper Python dictionaries: + +``` +>>> import chompjs +>>> +>>> chompjs.parse_js_object("{'a': 'b'}") +{u'a': u'b'} +>>> chompjs.parse_js_object('{a: "b"}') +{u'a': u'b'} +>>> chompjs.parse_js_object('{"a": [1, 2, 3,]}') +{u'a': [1, 2, 3]} +``` + +Internally `chompjs` use a parser written in C to iterate over raw string, fixing its issues along the way. 
The final result is then passed down to the standard library's `json.loads`, ensuring high speed compared to full-blown JavaScript parsers such as `demjson`. + +``` +>>> import json +>>> import _chompjs +>>> +>>> _chompjs.parse('{a: 1}') +'{"a":1}' +>>> json.loads(_) +{u'a': 1} +>>> chompjs.parse_js_object('{"a": .99}') +{'a': 0.99} +``` + +# Installation +From PIP: + +```bash +$ python3 -m venv venv +$ . venv/bin/activate +# pip install chompjs +``` +From sources: +```bash +$ git clone https://github.com/Nykakin/chompjs +$ cd chompjs +$ python setup.py build +$ python setup.py install +``` + +To run the unit tests: + +``` +$ python -m unittest +``` + + + + +%package -n python3-chompjs +Summary: Parsing JavaScript objects into Python dictionaries +Provides: python-chompjs +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-chompjs +# Usage + +`chompjs` can be used in web scraping for turning JavaScript objects embedded in pages into valid Python dictionaries. + +```python +>>> import chompjs +>>> chompjs.parse_js_object('{"my_data": "test"}') +{u'my_data': u'test'} +``` + +Think of it as a more powerful `json.loads`. For example, it can handle JSON objects containing embedded methods by storing their code in a string: + +```python +>>> import chompjs +>>> js = """ +... var myObj = { +... myMethod: function(params) { +... // ... +... }, +... myValue: 100 +... } +... """ +>>> chompjs.parse_js_object(js, json_params={'strict': False}) +{'myMethod': 'function(params) {\n // ...\n }', 'myValue': 100} +``` + +An example usage with `scrapy`: + +```python +import chompjs +import scrapy + + +class MySpider(scrapy.Spider): + # ...
+ + def parse(self, response): + script_css = 'script:contains("__NEXT_DATA__")::text' + script_pattern = r'__NEXT_DATA__ = (.*);' + # warning: for some pages you need to pass replace_entities=True + # into re_first to have JSON escaped properly + script_text = response.css(script_css).re_first(script_pattern) + try: + json_data = chompjs.parse_js_object(script_text) + except ValueError: + self.log('Failed to extract data from {}'.format(response.url)) + return + + # work on json_data +``` + +If the input string is not yet escaped and contains a lot of `\\` characters, then the `unicode_escape=True` argument might help to sanitize it: + +```python +>>> chompjs.parse_js_object('{\\\"a\\\": 12}', unicode_escape=True) +{u'a': 12} +``` + +`jsonlines=True` can be used to parse JSON Lines: + +```python +>>> chompjs.parse_js_object('[1,2]\n[2,3]\n[3,4]', jsonlines=True) +[[1, 2], [2, 3], [3, 4]] +``` + +By default `chompjs` tries to start with the first `{` or `[` character it finds, omitting the rest: + +```python +>>> chompjs.parse_js_object('
...
...
') +[1, 2, 3] +``` + +The `json_params` argument can be used to pass options to the underlying `json.loads`, such as `strict` or `object_hook`: + +```python +>>> import decimal +>>> import chompjs +>>> chompjs.parse_js_object('[23.2]', json_params={'parse_float': decimal.Decimal}) +[Decimal('23.2')] +``` + +# Rationale + +In web scraping, data is often not present directly inside HTML, but is instead provided as an embedded JavaScript object that is later used to initialize the page, for example: + +```html + +... + +... + +... + + +``` + +The standard library function `json.loads` is usually sufficient to extract this data: + +```python +>>> # scrapy shell file:///tmp/test.html +>>> import json +>>> script_text = response.css('script:contains(__PRELOADED_STATE__)::text').re_first('__PRELOADED_STATE__=(.*)') +>>> json.loads(script_text) +{u'foo': u'bar'} + +``` +The problem is that not all valid JavaScript objects are also valid JSON. For example, all of the following strings are valid JavaScript objects but not valid JSON: + +* `"{'a': 'b'}"` is not valid JSON because it uses the `'` character to quote +* `'{a: "b"}'` is not valid JSON because the property name is not quoted at all +* `'{"a": [1, 2, 3,]}'` is not valid JSON because there is an extra `,` character at the end of the array +* `'{"a": .99}'` is not valid JSON because the float value lacks a leading 0 + +As a result, `json.loads` fails to extract any of those: + +``` +>>> json.loads("{'a': 'b'}") +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 380, in raw_decode + obj, end = self.scan_once(s, idx) +ValueError: Expecting property name: line 1 column 2 (char 1) +>>> json.loads('{a: "b"}') +Traceback (most recent call last): + File "", line 1, in + File
"/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 380, in raw_decode + obj, end = self.scan_once(s, idx) +ValueError: Expecting property name: line 1 column 2 (char 1) +>>> json.loads('{"a": [1, 2, 3,]}') +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 382, in raw_decode + raise ValueError("No JSON object could be decoded") +ValueError: No JSON object could be decoded +>>> json.loads('{"a": .99}') +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python3.7/json/__init__.py", line 348, in loads + return _default_decoder.decode(s) + File "/usr/lib/python3.7/json/decoder.py", line 337, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python3.7/json/decoder.py", line 355, in raw_decode + raise JSONDecodeError("Expecting value", s, err.value) from None +json.decoder.JSONDecodeError: Expecting value: line 1 column 7 (char 6) + +``` +`chompjs` library was designed to bypass this limitation, and it allows to scrape such JavaScript objects into proper Python dictionaries: + +``` +>>> import chompjs +>>> +>>> chompjs.parse_js_object("{'a': 'b'}") +{u'a': u'b'} +>>> chompjs.parse_js_object('{a: "b"}') +{u'a': u'b'} +>>> chompjs.parse_js_object('{"a": [1, 2, 3,]}') +{u'a': [1, 2, 3]} +``` + +Internally `chompjs` use a parser written in C to iterate over raw string, fixing its issues along the way. 
The final result is then passed down to the standard library's `json.loads`, ensuring high speed compared to full-blown JavaScript parsers such as `demjson`. + +``` +>>> import json +>>> import _chompjs +>>> +>>> _chompjs.parse('{a: 1}') +'{"a":1}' +>>> json.loads(_) +{u'a': 1} +>>> chompjs.parse_js_object('{"a": .99}') +{'a': 0.99} +``` + +# Installation +From PIP: + +```bash +$ python3 -m venv venv +$ . venv/bin/activate +# pip install chompjs +``` +From sources: +```bash +$ git clone https://github.com/Nykakin/chompjs +$ cd chompjs +$ python setup.py build +$ python setup.py install +``` + +To run the unit tests: + +``` +$ python -m unittest +``` + + + + +%package help +Summary: Development documents and examples for chompjs +Provides: python3-chompjs-doc +%description help +# Usage + +`chompjs` can be used in web scraping for turning JavaScript objects embedded in pages into valid Python dictionaries. + +```python +>>> import chompjs +>>> chompjs.parse_js_object('{"my_data": "test"}') +{u'my_data': u'test'} +``` + +Think of it as a more powerful `json.loads`. For example, it can handle JSON objects containing embedded methods by storing their code in a string: + +```python +>>> import chompjs +>>> js = """ +... var myObj = { +... myMethod: function(params) { +... // ... +... }, +... myValue: 100 +... } +... """ +>>> chompjs.parse_js_object(js, json_params={'strict': False}) +{'myMethod': 'function(params) {\n // ...\n }', 'myValue': 100} +``` + +An example usage with `scrapy`: + +```python +import chompjs +import scrapy + + +class MySpider(scrapy.Spider): + # ...
+ + def parse(self, response): + script_css = 'script:contains("__NEXT_DATA__")::text' + script_pattern = r'__NEXT_DATA__ = (.*);' + # warning: for some pages you need to pass replace_entities=True + # into re_first to have JSON escaped properly + script_text = response.css(script_css).re_first(script_pattern) + try: + json_data = chompjs.parse_js_object(script_text) + except ValueError: + self.log('Failed to extract data from {}'.format(response.url)) + return + + # work on json_data +``` + +If the input string is not yet escaped and contains a lot of `\\` characters, then the `unicode_escape=True` argument might help to sanitize it: + +```python +>>> chompjs.parse_js_object('{\\\"a\\\": 12}', unicode_escape=True) +{u'a': 12} +``` + +`jsonlines=True` can be used to parse JSON Lines: + +```python +>>> chompjs.parse_js_object('[1,2]\n[2,3]\n[3,4]', jsonlines=True) +[[1, 2], [2, 3], [3, 4]] +``` + +By default `chompjs` tries to start with the first `{` or `[` character it finds, omitting the rest: + +```python +>>> chompjs.parse_js_object('
...
...
') +[1, 2, 3] +``` + +The `json_params` argument can be used to pass options to the underlying `json.loads`, such as `strict` or `object_hook`: + +```python +>>> import decimal +>>> import chompjs +>>> chompjs.parse_js_object('[23.2]', json_params={'parse_float': decimal.Decimal}) +[Decimal('23.2')] +``` + +# Rationale + +In web scraping, data is often not present directly inside HTML, but is instead provided as an embedded JavaScript object that is later used to initialize the page, for example: + +```html + +... + +... + +... + + +``` + +The standard library function `json.loads` is usually sufficient to extract this data: + +```python +>>> # scrapy shell file:///tmp/test.html +>>> import json +>>> script_text = response.css('script:contains(__PRELOADED_STATE__)::text').re_first('__PRELOADED_STATE__=(.*)') +>>> json.loads(script_text) +{u'foo': u'bar'} + +``` +The problem is that not all valid JavaScript objects are also valid JSON. For example, all of the following strings are valid JavaScript objects but not valid JSON: + +* `"{'a': 'b'}"` is not valid JSON because it uses the `'` character to quote +* `'{a: "b"}'` is not valid JSON because the property name is not quoted at all +* `'{"a": [1, 2, 3,]}'` is not valid JSON because there is an extra `,` character at the end of the array +* `'{"a": .99}'` is not valid JSON because the float value lacks a leading 0 + +As a result, `json.loads` fails to extract any of those: + +``` +>>> json.loads("{'a': 'b'}") +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 380, in raw_decode + obj, end = self.scan_once(s, idx) +ValueError: Expecting property name: line 1 column 2 (char 1) +>>> json.loads('{a: "b"}') +Traceback (most recent call last): + File "", line 1, in + File
"/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 380, in raw_decode + obj, end = self.scan_once(s, idx) +ValueError: Expecting property name: line 1 column 2 (char 1) +>>> json.loads('{"a": [1, 2, 3,]}') +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python2.7/json/__init__.py", line 339, in loads + return _default_decoder.decode(s) + File "/usr/lib/python2.7/json/decoder.py", line 364, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python2.7/json/decoder.py", line 382, in raw_decode + raise ValueError("No JSON object could be decoded") +ValueError: No JSON object could be decoded +>>> json.loads('{"a": .99}') +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib/python3.7/json/__init__.py", line 348, in loads + return _default_decoder.decode(s) + File "/usr/lib/python3.7/json/decoder.py", line 337, in decode + obj, end = self.raw_decode(s, idx=_w(s, 0).end()) + File "/usr/lib/python3.7/json/decoder.py", line 355, in raw_decode + raise JSONDecodeError("Expecting value", s, err.value) from None +json.decoder.JSONDecodeError: Expecting value: line 1 column 7 (char 6) + +``` +`chompjs` library was designed to bypass this limitation, and it allows to scrape such JavaScript objects into proper Python dictionaries: + +``` +>>> import chompjs +>>> +>>> chompjs.parse_js_object("{'a': 'b'}") +{u'a': u'b'} +>>> chompjs.parse_js_object('{a: "b"}') +{u'a': u'b'} +>>> chompjs.parse_js_object('{"a": [1, 2, 3,]}') +{u'a': [1, 2, 3]} +``` + +Internally `chompjs` use a parser written in C to iterate over raw string, fixing its issues along the way. 
The final result is then passed down to standard library's `json.loads`, ensuring a high speed as compared to full-blown JavaScript parsers such as `demjson`. + +``` +>>> import json +>>> import _chompjs +>>> +>>> _chompjs.parse('{a: 1}') +'{"a":1}' +>>> json.loads(_) +{u'a': 1} +>>> chompjs.parse_js_object('{"a": .99}') +{'a': 0.99} +``` + +# Installation +From PIP: + +```bash +$ python3 -m venv venv +$ . venv/bin/activate +# pip install chompjs +``` +From sources: +```bash +$ git clone https://github.com/Nykakin/chompjs +$ cd chompjs +$ python setup.py build +$ python setup.py install +``` + +To run unittests + +``` +$ python -m unittest +``` + + + + +%prep +%autosetup -n chompjs-1.1.9 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . 
+ +%files -n python3-chompjs -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Fri May 05 2023 Python_Bot - 1.1.9-1 +- Package Spec generated diff --git a/sources b/sources new file mode 100644 index 0000000..e54fae1 --- /dev/null +++ b/sources @@ -0,0 +1 @@ +477dfb7c676c95f53ea57d782c1a5679 chompjs-1.1.9.tar.gz -- cgit v1.2.3