From 1c95716eaa0b77a4329bb34afe6d6c171b6e9f78 Mon Sep 17 00:00:00 2001
From: CoprDistGit
Date: Tue, 11 Apr 2023 15:29:52 +0000
Subject: automatic import of python-clean-text

---
 python-clean-text.spec | 521 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 521 insertions(+)
 create mode 100644 python-clean-text.spec

diff --git a/python-clean-text.spec b/python-clean-text.spec
new file mode 100644
index 0000000..169eb97
--- /dev/null
+++ b/python-clean-text.spec
@@ -0,0 +1,521 @@
+%global _empty_manifest_terminate_build 0
+Name: python-clean-text
+Version: 0.6.0
+Release: 1
+Summary: Functions to preprocess and normalize text.
+License: Apache-2.0
+URL: https://pypi.org/project/clean-text/
+Source0: https://mirrors.nju.edu.cn/pypi/web/packages/c3/5c/3151736165b123611351c103908f24841d88df0dfe455ece15b2657adeae/clean-text-0.6.0.tar.gz
+BuildArch: noarch
+
+Requires: python3-emoji
+Requires: python3-ftfy
+Requires: python3-pandas
+Requires: python3-scikit-learn
+Requires: python3-unidecode
+
+%description
+# `clean-text` [![Build Status](https://img.shields.io/github/workflow/status/jfilter/clean-text/Test)](https://github.com/jfilter/clean-text/actions/workflows/test.yml) [![PyPI](https://img.shields.io/pypi/v/clean-text.svg)](https://pypi.org/project/clean-text/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/clean-text.svg)](https://pypi.org/project/clean-text/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/clean-text)](https://pypistats.org/packages/clean-text)
+
+User-generated content on the Web and in social media is often dirty. Preprocess your scraped data with `clean-text` to create a normalized text representation. For instance, turn this corrupted input:
+
+```txt
+A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
+
+
+»Yóù àré rïght <3!«
+```
+
+into this clean output:
+
+```txt
+A bunch of 'new' references, including [moana]().
+
+"you are right <3!"
+```
+
+`clean-text` uses [ftfy](https://github.com/LuminosoInsight/python-ftfy), [unidecode](https://github.com/takluyver/Unidecode) and numerous hand-crafted rules, i.e., regular expressions.
+
+## Installation
+
+To install `clean-text` together with the GPL-licensed package [unidecode](https://github.com/takluyver/Unidecode):
+
+```bash
+pip install clean-text[gpl]
+```
+
+If you prefer to avoid the GPL dependency:
+
+```bash
+pip install clean-text
+```
+
+NB: This package is named `clean-text` and not `cleantext`.
+
+If [unidecode](https://github.com/takluyver/Unidecode) is not available, `clean-text` resorts to Python's [unicodedata.normalize](https://docs.python.org/3.7/library/unicodedata.html#unicodedata.normalize) for [transliteration](https://en.wikipedia.org/wiki/Transliteration).
+Transliteration to the closest ASCII symbols relies on manual mappings, e.g., `ê` to `e`.
+`unidecode`'s mapping is superior, but `unicodedata`'s is sufficient.
+However, you may want to disable this feature altogether, depending on your data and use case.
+
+To be clear: there are **inconsistencies** between processing text with and without `unidecode`.
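+
+For a sense of what that difference looks like, here is a minimal sketch comparing the two approaches (illustrative only, not `clean-text`'s exact fallback logic):
+
+```python
+import unicodedata
+
+from unidecode import unidecode
+
+text = "Yóù àré rïght, ß & œ"
+
+# unidecode ships hand-crafted mappings, so ligatures and special
+# letters survive as readable ASCII.
+print(unidecode(text))  # -> "You are right, ss & oe"
+
+# The standard-library fallback decomposes characters (NFKD), then
+# drops anything that still is not ASCII: accents map fine, but
+# characters without a decomposition (ß, œ) are silently lost.
+fallback = (
+    unicodedata.normalize("NFKD", text)
+    .encode("ascii", "ignore")
+    .decode("ascii")
+)
+print(fallback)  # -> "You are right,  & "
+```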
+
+## Usage
+
+```python
+from cleantext import clean
+
+clean("some input",
+    fix_unicode=True,               # fix various unicode errors
+    to_ascii=True,                  # transliterate to closest ASCII representation
+    lower=True,                     # lowercase text
+    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
+    no_urls=False,                  # replace all URLs with a special token
+    no_emails=False,                # replace all email addresses with a special token
+    no_phone_numbers=False,         # replace all phone numbers with a special token
+    no_numbers=False,               # replace all numbers with a special token
+    no_digits=False,                # replace all digits with a special token
+    no_currency_symbols=False,      # replace all currency symbols with a special token
+    no_punct=False,                 # remove punctuation
+    replace_with_punct="",          # instead of removing punctuation, you may replace it
+    replace_with_url="<URL>",
+    replace_with_email="<EMAIL>",
+    replace_with_phone_number="<PHONE>",
+    replace_with_number="<NUMBER>",
+    replace_with_digit="0",
+    replace_with_currency_symbol="<CUR>",
+    lang="en"                       # set to 'de' for German special handling
+)
+```
+
+Carefully choose the arguments that fit your task. The default parameters are listed above.
+
+You may also use only specific functions for cleaning. For this, take a look at the [source code](https://github.com/jfilter/clean-text/blob/main/cleantext/clean.py).
+
+### Supported languages
+
+So far, only English and German are fully supported.
+It should work for the majority of Western languages.
+If you need special handling for your language, feel free to contribute. 🙃
+
+### Using `clean-text` with `scikit-learn`
+
+There is also a **scikit-learn**-compatible API to use in your pipelines.
+All of the parameters above work here as well.
+
+```bash
+pip install clean-text[gpl,sklearn]
+pip install clean-text[sklearn]
+```
+
+```python
+from cleantext.sklearn import CleanTransformer
+
+cleaner = CleanTransformer(no_punct=False, lower=False)
+
+cleaner.transform(['Happily clean your text!', 'Another Input'])
+```
+
+## Development
+
+[Use poetry.](https://python-poetry.org/)
+
+## Contributing
+
+If you have a **question**, found a **bug** or want to propose a new **feature**, have a look at the [issues page](https://github.com/jfilter/clean-text/issues).
+
+**Pull requests** are especially welcome when they fix bugs or improve the code quality.
+
+If you don't like the output of `clean-text`, consider adding a [test](https://github.com/jfilter/clean-text/tree/main/tests) with your specific input and desired output.
+
+## Related Work
+
+### Generic text cleaning packages
+
+- https://github.com/pudo/normality
+- https://github.com/davidmogar/cucco
+- https://github.com/lyeoni/prenlp
+- https://github.com/s/preprocessor
+- https://github.com/artefactory/NLPretext
+- https://github.com/cbaziotis/ekphrasis
+
+### Full-blown NLP libraries with some text cleaning
+
+- https://github.com/chartbeat-labs/textacy
+- https://github.com/jbesomi/texthero
+
+### Remove or replace strings
+
+- https://github.com/vi3k6i5/flashtext
+- https://github.com/ddelange/retrie
+
+### Detect dates
+
+- https://github.com/scrapinghub/dateparser
+
+### Clean massive Common Crawl data
+
+- https://github.com/facebookresearch/cc_net
+
+## Acknowledgements
+
+Built upon the work by [Burton DeWilde](https://github.com/bdewilde) for [Textacy](https://github.com/chartbeat-labs/textacy).
+
+## License
+
+Apache
+
+
+%package -n python3-clean-text
+Summary: Functions to preprocess and normalize text.
+Provides: python-clean-text
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-clean-text
+# `clean-text` [![Build Status](https://img.shields.io/github/workflow/status/jfilter/clean-text/Test)](https://github.com/jfilter/clean-text/actions/workflows/test.yml) [![PyPI](https://img.shields.io/pypi/v/clean-text.svg)](https://pypi.org/project/clean-text/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/clean-text.svg)](https://pypi.org/project/clean-text/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/clean-text)](https://pypistats.org/packages/clean-text)
+
+User-generated content on the Web and in social media is often dirty. Preprocess your scraped data with `clean-text` to create a normalized text representation. For instance, turn this corrupted input:
+
+```txt
+A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
+
+
+»Yóù àré rïght <3!«
+```
+
+into this clean output:
+
+```txt
+A bunch of 'new' references, including [moana]().
+
+"you are right <3!"
+```
+
+`clean-text` uses [ftfy](https://github.com/LuminosoInsight/python-ftfy), [unidecode](https://github.com/takluyver/Unidecode) and numerous hand-crafted rules, i.e., regular expressions.
+
+## Installation
+
+To install `clean-text` together with the GPL-licensed package [unidecode](https://github.com/takluyver/Unidecode):
+
+```bash
+pip install clean-text[gpl]
+```
+
+If you prefer to avoid the GPL dependency:
+
+```bash
+pip install clean-text
+```
+
+NB: This package is named `clean-text` and not `cleantext`.
+
+If [unidecode](https://github.com/takluyver/Unidecode) is not available, `clean-text` resorts to Python's [unicodedata.normalize](https://docs.python.org/3.7/library/unicodedata.html#unicodedata.normalize) for [transliteration](https://en.wikipedia.org/wiki/Transliteration).
+Transliteration to the closest ASCII symbols relies on manual mappings, e.g., `ê` to `e`.
+`unidecode`'s mapping is superior, but `unicodedata`'s is sufficient.
+However, you may want to disable this feature altogether, depending on your data and use case.
+
+To be clear: there are **inconsistencies** between processing text with and without `unidecode`.
+
+## Usage
+
+```python
+from cleantext import clean
+
+clean("some input",
+    fix_unicode=True,               # fix various unicode errors
+    to_ascii=True,                  # transliterate to closest ASCII representation
+    lower=True,                     # lowercase text
+    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
+    no_urls=False,                  # replace all URLs with a special token
+    no_emails=False,                # replace all email addresses with a special token
+    no_phone_numbers=False,         # replace all phone numbers with a special token
+    no_numbers=False,               # replace all numbers with a special token
+    no_digits=False,                # replace all digits with a special token
+    no_currency_symbols=False,      # replace all currency symbols with a special token
+    no_punct=False,                 # remove punctuation
+    replace_with_punct="",          # instead of removing punctuation, you may replace it
+    replace_with_url="<URL>",
+    replace_with_email="<EMAIL>",
+    replace_with_phone_number="<PHONE>",
+    replace_with_number="<NUMBER>",
+    replace_with_digit="0",
+    replace_with_currency_symbol="<CUR>",
+    lang="en"                       # set to 'de' for German special handling
+)
+```
+
+Carefully choose the arguments that fit your task. The default parameters are listed above.
+
+You may also use only specific functions for cleaning.
+For this, take a look at the [source code](https://github.com/jfilter/clean-text/blob/main/cleantext/clean.py).
+
+### Supported languages
+
+So far, only English and German are fully supported.
+It should work for the majority of Western languages.
+If you need special handling for your language, feel free to contribute. 🙃
+
+### Using `clean-text` with `scikit-learn`
+
+There is also a **scikit-learn**-compatible API to use in your pipelines.
+All of the parameters above work here as well.
+
+```bash
+pip install clean-text[gpl,sklearn]
+pip install clean-text[sklearn]
+```
+
+```python
+from cleantext.sklearn import CleanTransformer
+
+cleaner = CleanTransformer(no_punct=False, lower=False)
+
+cleaner.transform(['Happily clean your text!', 'Another Input'])
+```
+
+## Development
+
+[Use poetry.](https://python-poetry.org/)
+
+## Contributing
+
+If you have a **question**, found a **bug** or want to propose a new **feature**, have a look at the [issues page](https://github.com/jfilter/clean-text/issues).
+
+**Pull requests** are especially welcome when they fix bugs or improve the code quality.
+
+If you don't like the output of `clean-text`, consider adding a [test](https://github.com/jfilter/clean-text/tree/main/tests) with your specific input and desired output.
+
+## Related Work
+
+### Generic text cleaning packages
+
+- https://github.com/pudo/normality
+- https://github.com/davidmogar/cucco
+- https://github.com/lyeoni/prenlp
+- https://github.com/s/preprocessor
+- https://github.com/artefactory/NLPretext
+- https://github.com/cbaziotis/ekphrasis
+
+### Full-blown NLP libraries with some text cleaning
+
+- https://github.com/chartbeat-labs/textacy
+- https://github.com/jbesomi/texthero
+
+### Remove or replace strings
+
+- https://github.com/vi3k6i5/flashtext
+- https://github.com/ddelange/retrie
+
+### Detect dates
+
+- https://github.com/scrapinghub/dateparser
+
+### Clean massive Common Crawl data
+
+- https://github.com/facebookresearch/cc_net
+
+## Acknowledgements
+
+Built upon the work by [Burton DeWilde](https://github.com/bdewilde) for [Textacy](https://github.com/chartbeat-labs/textacy).
+
+## License
+
+Apache
+
+
+%package help
+Summary: Development documents and examples for clean-text
+Provides: python3-clean-text-doc
+%description help
+# `clean-text` [![Build Status](https://img.shields.io/github/workflow/status/jfilter/clean-text/Test)](https://github.com/jfilter/clean-text/actions/workflows/test.yml) [![PyPI](https://img.shields.io/pypi/v/clean-text.svg)](https://pypi.org/project/clean-text/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/clean-text.svg)](https://pypi.org/project/clean-text/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/clean-text)](https://pypistats.org/packages/clean-text)
+
+User-generated content on the Web and in social media is often dirty. Preprocess your scraped data with `clean-text` to create a normalized text representation. For instance, turn this corrupted input:
+
+```txt
+A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
+
+
+»Yóù àré rïght <3!«
+```
+
+into this clean output:
+
+```txt
+A bunch of 'new' references, including [moana]().
+
+"you are right <3!"
+```
+
+`clean-text` uses [ftfy](https://github.com/LuminosoInsight/python-ftfy), [unidecode](https://github.com/takluyver/Unidecode) and numerous hand-crafted rules, i.e., regular expressions.
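+
+As a rough illustration of how these pieces fit together, the following sketch chains the same building blocks by hand (a simplified toy, not the library's actual pipeline; the real rules live in `cleantext/clean.py`):
+
+```python
+import re
+
+import ftfy
+from unidecode import unidecode
+
+def tiny_clean(text: str) -> str:
+    text = ftfy.fix_text(text)        # repair mojibake and broken unicode
+    text = unidecode(text)            # transliterate to the closest ASCII
+    text = text.lower()               # normalize case
+    text = re.sub(r"\s+", " ", text)  # one hand-crafted RegEx rule: collapse whitespace
+    return text.strip()
+
+print(tiny_clean("»Yóù  àré rïght«"))  # -> ">>you are right<<"
+```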
+
+## Installation
+
+To install `clean-text` together with the GPL-licensed package [unidecode](https://github.com/takluyver/Unidecode):
+
+```bash
+pip install clean-text[gpl]
+```
+
+If you prefer to avoid the GPL dependency:
+
+```bash
+pip install clean-text
+```
+
+NB: This package is named `clean-text` and not `cleantext`.
+
+If [unidecode](https://github.com/takluyver/Unidecode) is not available, `clean-text` resorts to Python's [unicodedata.normalize](https://docs.python.org/3.7/library/unicodedata.html#unicodedata.normalize) for [transliteration](https://en.wikipedia.org/wiki/Transliteration).
+Transliteration to the closest ASCII symbols relies on manual mappings, e.g., `ê` to `e`.
+`unidecode`'s mapping is superior, but `unicodedata`'s is sufficient.
+However, you may want to disable this feature altogether, depending on your data and use case.
+
+To be clear: there are **inconsistencies** between processing text with and without `unidecode`.
+
+## Usage
+
+```python
+from cleantext import clean
+
+clean("some input",
+    fix_unicode=True,               # fix various unicode errors
+    to_ascii=True,                  # transliterate to closest ASCII representation
+    lower=True,                     # lowercase text
+    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
+    no_urls=False,                  # replace all URLs with a special token
+    no_emails=False,                # replace all email addresses with a special token
+    no_phone_numbers=False,         # replace all phone numbers with a special token
+    no_numbers=False,               # replace all numbers with a special token
+    no_digits=False,                # replace all digits with a special token
+    no_currency_symbols=False,      # replace all currency symbols with a special token
+    no_punct=False,                 # remove punctuation
+    replace_with_punct="",          # instead of removing punctuation, you may replace it
+    replace_with_url="<URL>",
+    replace_with_email="<EMAIL>",
+    replace_with_phone_number="<PHONE>",
+    replace_with_number="<NUMBER>",
+    replace_with_digit="0",
+    replace_with_currency_symbol="<CUR>",
+    lang="en"                       # set to 'de' for German special handling
+)
+```
+
+Carefully choose the arguments that fit your task. The default parameters are listed above.
+
+You may also use only specific functions for cleaning. For this, take a look at the [source code](https://github.com/jfilter/clean-text/blob/main/cleantext/clean.py).
+
+### Supported languages
+
+So far, only English and German are fully supported.
+It should work for the majority of Western languages.
+If you need special handling for your language, feel free to contribute. 🙃
+
+### Using `clean-text` with `scikit-learn`
+
+There is also a **scikit-learn**-compatible API to use in your pipelines.
+All of the parameters above work here as well.
+
+```bash
+pip install clean-text[gpl,sklearn]
+pip install clean-text[sklearn]
+```
+
+```python
+from cleantext.sklearn import CleanTransformer
+
+cleaner = CleanTransformer(no_punct=False, lower=False)
+
+cleaner.transform(['Happily clean your text!', 'Another Input'])
+```
+
+## Development
+
+[Use poetry.](https://python-poetry.org/)
+
+## Contributing
+
+If you have a **question**, found a **bug** or want to propose a new **feature**, have a look at the [issues page](https://github.com/jfilter/clean-text/issues).
+
+**Pull requests** are especially welcome when they fix bugs or improve the code quality.
+
+If you don't like the output of `clean-text`, consider adding a [test](https://github.com/jfilter/clean-text/tree/main/tests) with your specific input and desired output.
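+
+For instance, such a test could be as small as this (a hypothetical sketch based on the example above; see the existing tests for the project's actual conventions):
+
+```python
+# Hypothetical test case pairing a specific input with the desired output.
+from cleantext import clean
+
+def test_transliterates_and_lowercases():
+    assert clean("Yóù àré rïght <3!") == "you are right <3!"
+```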
+
+## Related Work
+
+### Generic text cleaning packages
+
+- https://github.com/pudo/normality
+- https://github.com/davidmogar/cucco
+- https://github.com/lyeoni/prenlp
+- https://github.com/s/preprocessor
+- https://github.com/artefactory/NLPretext
+- https://github.com/cbaziotis/ekphrasis
+
+### Full-blown NLP libraries with some text cleaning
+
+- https://github.com/chartbeat-labs/textacy
+- https://github.com/jbesomi/texthero
+
+### Remove or replace strings
+
+- https://github.com/vi3k6i5/flashtext
+- https://github.com/ddelange/retrie
+
+### Detect dates
+
+- https://github.com/scrapinghub/dateparser
+
+### Clean massive Common Crawl data
+
+- https://github.com/facebookresearch/cc_net
+
+## Acknowledgements
+
+Built upon the work by [Burton DeWilde](https://github.com/bdewilde) for [Textacy](https://github.com/chartbeat-labs/textacy).
+
+## License
+
+Apache
+
+
+%prep
+%autosetup -n clean-text-0.6.0
+
+%build
+%py3_build
+
+%install
+%py3_install
+# Copy any doc/example directories shipped in the sources into the doc package.
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+# Record everything the build installed, to feed the file lists below.
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+	find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+	find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+	find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+	find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+	# rpm compresses man pages at install time, hence the .gz suffix.
+	find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-clean-text -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Tue Apr 11 2023 Python_Bot - 0.6.0-1
+- Package Spec generated