From 4d19c12b9d6a33903b4d1058e27ee5f7a1d82b3a Mon Sep 17 00:00:00 2001
From: CoprDistGit
Date: Mon, 10 Apr 2023 10:20:27 +0000
Subject: automatic import of python-urlextract

---
 .gitignore             |   1 +
 python-urlextract.spec | 316 ++++++++++++++++++++++++++++++++++++++++++++++++
 sources                |   1 +
 3 files changed, 318 insertions(+)
 create mode 100644 python-urlextract.spec
 create mode 100644 sources

diff --git a/.gitignore b/.gitignore
index e69de29..147a209 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/urlextract-1.8.0.tar.gz
diff --git a/python-urlextract.spec b/python-urlextract.spec
new file mode 100644
index 0000000..57d78f3
--- /dev/null
+++ b/python-urlextract.spec
@@ -0,0 +1,316 @@
+%global _empty_manifest_terminate_build 0
+Name: python-urlextract
+Version: 1.8.0
+Release: 1
+Summary: Collects and extracts URLs from given text.
+License: MIT
+URL: https://github.com/lipoja/URLExtract
+Source0: https://mirrors.nju.edu.cn/pypi/web/packages/10/8a/8f849baeab481e054b1db7fd5fff58b8d2c8360e9320bc8d0f174c305e50/urlextract-1.8.0.tar.gz
+BuildArch: noarch
+
+Requires: python3-idna
+Requires: python3-uritools
+Requires: python3-platformdirs
+Requires: python3-filelock
+
+%description
+URLExtract is a Python class for collecting (extracting) URLs from given
+text, based on locating TLDs.
+How does it work
+~~~~~~~~~~~~~~~~
+It tries to find any occurrence of a TLD in the given text. When a TLD is
+found, it expands the boundaries from that position in both directions,
+searching for a "stop character" (usually whitespace, comma, or a single
+or double quote).
+A DNS check option is also available, to reject invalid domain names.
+NOTE: The list of TLDs is downloaded from iana.org to keep you up to date with new TLDs.
+Installation
+~~~~~~~~~~~~
+The package is available on PyPI - you can install it via pip.
+    pip install urlextract
+Documentation
+~~~~~~~~~~~~~
+Online documentation is published at http://urlextract.readthedocs.io/
+Requirements
+~~~~~~~~~~~~
+- IDNA for converting links to IDNA format
+- uritools for domain name validation
+- platformdirs for determining the user's cache directory
+- dnspython for caching DNS results
+    pip install idna
+    pip install uritools
+    pip install platformdirs
+    pip install dnspython
+Or you can install the requirements from `requirements.txt`:
+    pip install -r requirements.txt
+Run tox
+~~~~~~~
+Install tox:
+    pip install tox
+Then run it:
+    tox
+Example
+~~~~~~~
+You can look at the command-line program at the end of *urlextract.py*.
+But everything you need to know is this:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    urls = extractor.find_urls("Text with URLs. Let's have URL janlipovsky.cz as an example.")
+    print(urls)  # prints: ['janlipovsky.cz']
+Or you can get a generator over the URLs in the text:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example."
+    for url in extractor.gen_urls(example_text):
+        print(url)  # prints: janlipovsky.cz
+Or, if you just want to check whether the text contains at least one URL:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example."
+    if extractor.has_urls(example_text):
+        print("Given text contains some URL")
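+The DNS check mentioned above is exposed as a keyword argument in recent
+urlextract releases; a minimal sketch, assuming the ``check_dns`` parameter
+name (an assumption - verify it against the installed version):
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    # With the assumed check_dns flag, candidate domains that do not
+    # resolve in DNS are rejected (DNS results are cached via dnspython).
+    urls = extractor.find_urls("See example.com for details.", check_dns=True)
+    print(urls)  # prints: ['example.com'] when the domain resolves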
+If you want an up-to-date list of TLDs, you can use ``update()``:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    extractor.update()
+or the ``update_when_older()`` method:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    extractor.update_when_older(7)  # updates when the list is older than 7 days
+Known issues
+~~~~~~~~~~~~
+Since a TLD can be not only an abbreviation but also a meaningful word, we might see "false matches" when searching
+for URLs in HTML pages. A false match can occur, for example, in CSS or JS when an HTML element is referred to
+by its classes.
+Example HTML code:
+    <p class="bold name">Jan</p>
+
+If this HTML snippet is passed to ``urlextract.find_urls()``, it will return ``p.bold.name`` as a URL.
+This behavior of urlextract is correct: ``.name`` is a valid TLD, so urlextract sees ``bold.name`` as a
+valid domain name with ``p`` as a valid sub-domain.
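+A minimal way to screen out such false matches, using only plain Python on
+top of the documented ``find_urls()`` call (the scheme-based filter below is
+an illustrative assumption, not an urlextract feature):
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    html = '<p class="bold name">Jan</p>'
+    # Keep only matches with an explicit scheme, dropping selector-like
+    # tokens such as p.bold.name.
+    urls = [u for u in extractor.find_urls(html) if u.startswith(("http://", "https://"))]
+    print(urls)  # prints: []
+Note this filter also drops legitimate scheme-less URLs, so it is a
+trade-off rather than a general fix.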
+License
+~~~~~~~
+This piece of code is licensed under The MIT License.
+
+%package -n python3-urlextract
+Summary: Collects and extracts URLs from given text.
+Provides: python-urlextract
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-urlextract
+URLExtract is a Python class for collecting (extracting) URLs from given
+text, based on locating TLDs.
+How does it work
+~~~~~~~~~~~~~~~~
+It tries to find any occurrence of a TLD in the given text. When a TLD is
+found, it expands the boundaries from that position in both directions,
+searching for a "stop character" (usually whitespace, comma, or a single
+or double quote).
+A DNS check option is also available, to reject invalid domain names.
+NOTE: The list of TLDs is downloaded from iana.org to keep you up to date with new TLDs.
+Installation
+~~~~~~~~~~~~
+The package is available on PyPI - you can install it via pip.
+    pip install urlextract
+Documentation
+~~~~~~~~~~~~~
+Online documentation is published at http://urlextract.readthedocs.io/
+Requirements
+~~~~~~~~~~~~
+- IDNA for converting links to IDNA format
+- uritools for domain name validation
+- platformdirs for determining the user's cache directory
+- dnspython for caching DNS results
+    pip install idna
+    pip install uritools
+    pip install platformdirs
+    pip install dnspython
+Or you can install the requirements from `requirements.txt`:
+    pip install -r requirements.txt
+Run tox
+~~~~~~~
+Install tox:
+    pip install tox
+Then run it:
+    tox
+Example
+~~~~~~~
+You can look at the command-line program at the end of *urlextract.py*.
+But everything you need to know is this:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    urls = extractor.find_urls("Text with URLs. Let's have URL janlipovsky.cz as an example.")
+    print(urls)  # prints: ['janlipovsky.cz']
+Or you can get a generator over the URLs in the text:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example."
+    for url in extractor.gen_urls(example_text):
+        print(url)  # prints: janlipovsky.cz
+Or, if you just want to check whether the text contains at least one URL:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example."
+    if extractor.has_urls(example_text):
+        print("Given text contains some URL")
+If you want an up-to-date list of TLDs, you can use ``update()``:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    extractor.update()
+or the ``update_when_older()`` method:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    extractor.update_when_older(7)  # updates when the list is older than 7 days
+Known issues
+~~~~~~~~~~~~
+Since a TLD can be not only an abbreviation but also a meaningful word, we might see "false matches" when searching
+for URLs in HTML pages. A false match can occur, for example, in CSS or JS when an HTML element is referred to
+by its classes.
+Example HTML code:
+    <p class="bold name">Jan</p>
+
+If this HTML snippet is passed to ``urlextract.find_urls()``, it will return ``p.bold.name`` as a URL.
+This behavior of urlextract is correct: ``.name`` is a valid TLD, so urlextract sees ``bold.name`` as a
+valid domain name with ``p`` as a valid sub-domain.
+License
+~~~~~~~
+This piece of code is licensed under The MIT License.
+
+%package help
+Summary: Development documents and examples for urlextract
+Provides: python3-urlextract-doc
+%description help
+URLExtract is a Python class for collecting (extracting) URLs from given
+text, based on locating TLDs.
+How does it work
+~~~~~~~~~~~~~~~~
+It tries to find any occurrence of a TLD in the given text. When a TLD is
+found, it expands the boundaries from that position in both directions,
+searching for a "stop character" (usually whitespace, comma, or a single
+or double quote).
+A DNS check option is also available, to reject invalid domain names.
+NOTE: The list of TLDs is downloaded from iana.org to keep you up to date with new TLDs.
+Installation
+~~~~~~~~~~~~
+The package is available on PyPI - you can install it via pip.
+    pip install urlextract
+Documentation
+~~~~~~~~~~~~~
+Online documentation is published at http://urlextract.readthedocs.io/
+Requirements
+~~~~~~~~~~~~
+- IDNA for converting links to IDNA format
+- uritools for domain name validation
+- platformdirs for determining the user's cache directory
+- dnspython for caching DNS results
+    pip install idna
+    pip install uritools
+    pip install platformdirs
+    pip install dnspython
+Or you can install the requirements from `requirements.txt`:
+    pip install -r requirements.txt
+Run tox
+~~~~~~~
+Install tox:
+    pip install tox
+Then run it:
+    tox
+Example
+~~~~~~~
+You can look at the command-line program at the end of *urlextract.py*.
+But everything you need to know is this:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    urls = extractor.find_urls("Text with URLs. Let's have URL janlipovsky.cz as an example.")
+    print(urls)  # prints: ['janlipovsky.cz']
+Or you can get a generator over the URLs in the text:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example."
+    for url in extractor.gen_urls(example_text):
+        print(url)  # prints: janlipovsky.cz
+Or, if you just want to check whether the text contains at least one URL:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    example_text = "Text with URLs. Let's have URL janlipovsky.cz as an example."
+    if extractor.has_urls(example_text):
+        print("Given text contains some URL")
+If you want an up-to-date list of TLDs, you can use ``update()``:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    extractor.update()
+or the ``update_when_older()`` method:
+    from urlextract import URLExtract
+    extractor = URLExtract()
+    extractor.update_when_older(7)  # updates when the list is older than 7 days
+Known issues
+~~~~~~~~~~~~
+Since a TLD can be not only an abbreviation but also a meaningful word, we might see "false matches" when searching
+for URLs in HTML pages. A false match can occur, for example, in CSS or JS when an HTML element is referred to
+by its classes.
+Example HTML code:
+    <p class="bold name">Jan</p>
+
+If this HTML snippet is passed to ``urlextract.find_urls()``, it will return ``p.bold.name`` as a URL.
+This behavior of urlextract is correct: ``.name`` is a valid TLD, so urlextract sees ``bold.name`` as a
+valid domain name with ``p`` as a valid sub-domain.
+License
+~~~~~~~
+This piece of code is licensed under The MIT License.
+
+%prep
+%autosetup -n urlextract-1.8.0
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+# Build the file lists consumed by the %%files sections below: files
+# installed under the usual binary/library paths go into filelist.lst,
+# man pages into doclist.lst.
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+	find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+	find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+	find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+	find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+	find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-urlextract -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Mon Apr 10 2023 Python_Bot - 1.8.0-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..8f3fae2
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+1eecd621aaa194274ec58468e6428274 urlextract-1.8.0.tar.gz
--
cgit v1.2.3