diff options
author | CoprDistGit <infra@openeuler.org> | 2023-04-10 10:20:27 +0000 |
---|---|---|
committer | CoprDistGit <infra@openeuler.org> | 2023-04-10 10:20:27 +0000 |
commit | 4d19c12b9d6a33903b4d1058e27ee5f7a1d82b3a (patch) | |
tree | 5c77ee445be814c1ee7a47ea3550f99256c14cea | |
parent | c8f26ad49602f86fc0e4957bef4a47f1f6d1d623 (diff) |
automatic import of python-urlextract
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | python-urlextract.spec | 316 | ||||
-rw-r--r-- | sources | 1 |
3 files changed, 318 insertions, 0 deletions
@@ -0,0 +1 @@ +/urlextract-1.8.0.tar.gz diff --git a/python-urlextract.spec b/python-urlextract.spec new file mode 100644 index 0000000..57d78f3 --- /dev/null +++ b/python-urlextract.spec @@ -0,0 +1,316 @@ +%global _empty_manifest_terminate_build 0 +Name: python-urlextract +Version: 1.8.0 +Release: 1 +Summary: Collects and extracts URLs from given text. +License: MIT +URL: https://github.com/lipoja/URLExtract +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/10/8a/8f849baeab481e054b1db7fd5fff58b8d2c8360e9320bc8d0f174c305e50/urlextract-1.8.0.tar.gz +BuildArch: noarch + +Requires: python3-idna +Requires: python3-uritools +Requires: python3-platformdirs +Requires: python3-filelock + +%description +URLExtract is python class for collecting (extracting) URLs from given +text based on locating TLD. +How does it work +~~~~~~~~~~~~~~~~ +It tries to find any occurrence of TLD in given text. If TLD is found it +starts from that position to expand boundaries to both sides searching +for "stop character" (usually whitespace, comma, single or double +quote). +A dns check option is available to also reject invalid domain names. +NOTE: List of TLDs is downloaded from iana.org to keep you up to date with new TLDs. +Installation +~~~~~~~~~~~~ +Package is available on PyPI - you can install it via pip. + pip install urlextract +Documentation +~~~~~~~~~~~~~ +Online documentation is published at http://urlextract.readthedocs.io/ +Requirements +~~~~~~~~~~~~ +- IDNA for converting links to IDNA format +- uritools for domain name validation +- platformdirs for determining user's cache directory +- dnspython to cache DNS results + pip install idna + pip install uritools + pip install platformdirs + pip install dnspython +Or you can install the requirements with `requirements.txt`: + pip install -r requirements.txt +Run tox +~~~~~~~ +Install tox: + pip install tox +Then run it: + tox +Example +~~~~~~~ +You can look at command line program at the end of *urlextract.py*. +But everything you need to know is this: + from urlextract import URLExtract + extractor = URLExtract() + urls = extractor.find_urls("Text with URLs. Let's have URL janlipovsky.cz as an example.") + print(urls) # prints: ['janlipovsky.cz'] +Or you can get generator over URLs in text by: + from urlextract import URLExtract + extractor = URLExtract() + example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example." + for url in extractor.gen_urls(example_text): + print(url) # prints: ['janlipovsky.cz'] +Or if you want to just check if there is at least one URL you can do: + from urlextract import URLExtract + extractor = URLExtract() + example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example." + if extractor.has_urls(example_text): + print("Given text contains some URL") +If you want to have up to date list of TLDs you can use ``update()``: + from urlextract import URLExtract + extractor = URLExtract() + extractor.update() +or ``update_when_older()`` method: + from urlextract import URLExtract + extractor = URLExtract() + extractor.update_when_older(7) # updates when list is older that 7 days +Known issues +~~~~~~~~~~~~ +Since TLD can be not only shortcut but also some meaningful word we might see "false matches" when we are searching +for URL in some HTML pages. The false match can occur for example in css or JS when you are referring to HTML item +using its classes. +Example HTML code: + <p class="bold name">Jan</p> + <style> + p.bold.name { + font-weight: bold; + } + </style> +If this HTML snippet is on the input of ``urlextract.find_urls()`` it will return ``p.bold.name`` as an URL. +Behavior of urlextract is correct, because ``.name`` is valid TLD and urlextract just see that there is ``bold.name`` +valid domain name and ``p`` is valid sub-domain. +License +~~~~~~~ +This piece of code is licensed under The MIT License. + +%package -n python3-urlextract +Summary: Collects and extracts URLs from given text. +Provides: python-urlextract +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-urlextract +URLExtract is python class for collecting (extracting) URLs from given +text based on locating TLD. +How does it work +~~~~~~~~~~~~~~~~ +It tries to find any occurrence of TLD in given text. If TLD is found it +starts from that position to expand boundaries to both sides searching +for "stop character" (usually whitespace, comma, single or double +quote). +A dns check option is available to also reject invalid domain names. +NOTE: List of TLDs is downloaded from iana.org to keep you up to date with new TLDs. +Installation +~~~~~~~~~~~~ +Package is available on PyPI - you can install it via pip. + pip install urlextract +Documentation +~~~~~~~~~~~~~ +Online documentation is published at http://urlextract.readthedocs.io/ +Requirements +~~~~~~~~~~~~ +- IDNA for converting links to IDNA format +- uritools for domain name validation +- platformdirs for determining user's cache directory +- dnspython to cache DNS results + pip install idna + pip install uritools + pip install platformdirs + pip install dnspython +Or you can install the requirements with `requirements.txt`: + pip install -r requirements.txt +Run tox +~~~~~~~ +Install tox: + pip install tox +Then run it: + tox +Example +~~~~~~~ +You can look at command line program at the end of *urlextract.py*. +But everything you need to know is this: + from urlextract import URLExtract + extractor = URLExtract() + urls = extractor.find_urls("Text with URLs. Let's have URL janlipovsky.cz as an example.") + print(urls) # prints: ['janlipovsky.cz'] +Or you can get generator over URLs in text by: + from urlextract import URLExtract + extractor = URLExtract() + example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example." + for url in extractor.gen_urls(example_text): + print(url) # prints: ['janlipovsky.cz'] +Or if you want to just check if there is at least one URL you can do: + from urlextract import URLExtract + extractor = URLExtract() + example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example." + if extractor.has_urls(example_text): + print("Given text contains some URL") +If you want to have up to date list of TLDs you can use ``update()``: + from urlextract import URLExtract + extractor = URLExtract() + extractor.update() +or ``update_when_older()`` method: + from urlextract import URLExtract + extractor = URLExtract() + extractor.update_when_older(7) # updates when list is older that 7 days +Known issues +~~~~~~~~~~~~ +Since TLD can be not only shortcut but also some meaningful word we might see "false matches" when we are searching +for URL in some HTML pages. The false match can occur for example in css or JS when you are referring to HTML item +using its classes. +Example HTML code: + <p class="bold name">Jan</p> + <style> + p.bold.name { + font-weight: bold; + } + </style> +If this HTML snippet is on the input of ``urlextract.find_urls()`` it will return ``p.bold.name`` as an URL. +Behavior of urlextract is correct, because ``.name`` is valid TLD and urlextract just see that there is ``bold.name`` +valid domain name and ``p`` is valid sub-domain. +License +~~~~~~~ +This piece of code is licensed under The MIT License. + +%package help +Summary: Development documents and examples for urlextract +Provides: python3-urlextract-doc +%description help +URLExtract is python class for collecting (extracting) URLs from given +text based on locating TLD. +How does it work +~~~~~~~~~~~~~~~~ +It tries to find any occurrence of TLD in given text. If TLD is found it +starts from that position to expand boundaries to both sides searching +for "stop character" (usually whitespace, comma, single or double +quote). +A dns check option is available to also reject invalid domain names. +NOTE: List of TLDs is downloaded from iana.org to keep you up to date with new TLDs. +Installation +~~~~~~~~~~~~ +Package is available on PyPI - you can install it via pip. + pip install urlextract +Documentation +~~~~~~~~~~~~~ +Online documentation is published at http://urlextract.readthedocs.io/ +Requirements +~~~~~~~~~~~~ +- IDNA for converting links to IDNA format +- uritools for domain name validation +- platformdirs for determining user's cache directory +- dnspython to cache DNS results + pip install idna + pip install uritools + pip install platformdirs + pip install dnspython +Or you can install the requirements with `requirements.txt`: + pip install -r requirements.txt +Run tox +~~~~~~~ +Install tox: + pip install tox +Then run it: + tox +Example +~~~~~~~ +You can look at command line program at the end of *urlextract.py*. +But everything you need to know is this: + from urlextract import URLExtract + extractor = URLExtract() + urls = extractor.find_urls("Text with URLs. Let's have URL janlipovsky.cz as an example.") + print(urls) # prints: ['janlipovsky.cz'] +Or you can get generator over URLs in text by: + from urlextract import URLExtract + extractor = URLExtract() + example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example." + for url in extractor.gen_urls(example_text): + print(url) # prints: ['janlipovsky.cz'] +Or if you want to just check if there is at least one URL you can do: + from urlextract import URLExtract + extractor = URLExtract() + example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example." + if extractor.has_urls(example_text): + print("Given text contains some URL") +If you want to have up to date list of TLDs you can use ``update()``: + from urlextract import URLExtract + extractor = URLExtract() + extractor.update() +or ``update_when_older()`` method: + from urlextract import URLExtract + extractor = URLExtract() + extractor.update_when_older(7) # updates when list is older that 7 days +Known issues +~~~~~~~~~~~~ +Since TLD can be not only shortcut but also some meaningful word we might see "false matches" when we are searching +for URL in some HTML pages. The false match can occur for example in css or JS when you are referring to HTML item +using its classes. +Example HTML code: + <p class="bold name">Jan</p> + <style> + p.bold.name { + font-weight: bold; + } + </style> +If this HTML snippet is on the input of ``urlextract.find_urls()`` it will return ``p.bold.name`` as an URL. +Behavior of urlextract is correct, because ``.name`` is valid TLD and urlextract just see that there is ``bold.name`` +valid domain name and ``p`` is valid sub-domain. +License +~~~~~~~ +This piece of code is licensed under The MIT License. + +%prep +%autosetup -n urlextract-1.8.0 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . + +%files -n python3-urlextract -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Mon Apr 10 2023 Python_Bot <Python_Bot@openeuler.org> - 1.8.0-1 +- Package Spec generated @@ -0,0 +1 @@ +1eecd621aaa194274ec58468e6428274 urlextract-1.8.0.tar.gz |