author     CoprDistGit <infra@openeuler.org>   2024-01-29 04:08:54 +0000
committer  CoprDistGit <infra@openeuler.org>   2024-01-29 04:08:54 +0000
commit     0d896a9d0e9e6bd73b4fda5c18f9b9012fe54744 (patch)
tree       4ba32cc7883ed1d644ac47e551fad988e632e2ab
parent     ba187cb14d6a52c0cbca519475f8c7520651fc6d (diff)
automatic import of python-tokenizers (openeuler23.09)
-rw-r--r--  .gitignore               1
-rw-r--r--  python-tokenizers.spec  71
-rw-r--r--  sources                  1
3 files changed, 73 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..ab4b740 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/v0.15.1.tar.gz
diff --git a/python-tokenizers.spec b/python-tokenizers.spec
new file mode 100644
index 0000000..02bc89f
--- /dev/null
+++ b/python-tokenizers.spec
@@ -0,0 +1,71 @@
+%global debug_package %{nil}
+%global _empty_manifest_terminate_build 0
+
+Name: python-tokenizers
+Version: 0.15.1
+Release: 1
+Summary: Fast State-of-the-Art Tokenizers optimized for Research and Production
+License: Apache-2.0
+URL: https://github.com/huggingface/tokenizers
+Source0: https://github.com/huggingface/tokenizers/archive/refs/tags/v%{version}.tar.gz
+
+Requires: python3-numpy
+Requires: python3-pytorch
+
+%description
+A Tokenizer works as a pipeline: it processes some raw text as input and outputs an Encoding. The various steps of the pipeline are:
+The Normalizer: in charge of normalizing the text. Common examples of normalization are the Unicode normalization
+ standards, such as NFD or NFKC. More details about how to use the Normalizers are available on the Hugging Face blog.
+The PreTokenizer: in charge of creating the initial word splits in the text. The most common way of splitting text is simply on whitespace.
+The Model: in charge of doing the actual tokenization. An example of a Model would be BPE or WordPiece.
+The PostProcessor: in charge of post-processing the Encoding to add anything relevant that, for example,
+ a language model would need, such as special tokens.
+
+%package -n python3-tokenizers
+Summary: Fast State-of-the-Art Tokenizers optimized for Research and Production
+Provides: python3-tokenizers
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-setuptools_scm
+BuildRequires: python3-pbr
+BuildRequires: python3-pip
+BuildRequires: python3-wheel
+BuildRequires: python3-hatchling
+
+BuildRequires: rust cargo
+BuildRequires: python3-maturin
+BuildRequires: python3-setuptools-rust
+
+%description -n python3-tokenizers
+A Tokenizer works as a pipeline: it processes some raw text as input and outputs an Encoding. The various steps of the pipeline are:
+The Normalizer: in charge of normalizing the text. Common examples of normalization are the Unicode normalization
+ standards, such as NFD or NFKC. More details about how to use the Normalizers are available on the Hugging Face blog.
+The PreTokenizer: in charge of creating the initial word splits in the text. The most common way of splitting text is simply on whitespace.
+The Model: in charge of doing the actual tokenization. An example of a Model would be BPE or WordPiece.
+The PostProcessor: in charge of post-processing the Encoding to add anything relevant that, for example,
+ a language model would need, such as special tokens.
+
+%prep
+%autosetup -p1 -n tokenizers-%{version}
+
+%build
+pushd ./bindings/python
+%pyproject_build
+popd
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+
+%install
+pushd ./bindings/python
+%pyproject_install
+popd
+
+%files -n python3-tokenizers
+%doc *.md
+%license LICENSE
+%{python3_sitearch}/*
+
+%changelog
+* Sun Jan 28 2024 Binshuo Zu <274620705z@gmail.com> - 0.15.1-1
+- Package init
diff --git a/sources b/sources
new file mode 100644
index 0000000..205b997
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+7cc76ef8345f55428279986b20e422ca v0.15.1.tar.gz
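
For reference, the tokenization pipeline described in the spec's %description can be exercised from Python roughly as follows. This is a minimal sketch against the upstream tokenizers API, not part of the packaged sources; the training corpus file "corpus.txt" is a placeholder.

    # Sketch of the pipeline described above:
    # Normalizer -> PreTokenizer -> Model -> PostProcessor.
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.normalizers import NFD
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import BpeTrainer

    # The Model does the actual tokenization (BPE here).
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    # The Normalizer applies Unicode normalization (NFD).
    tokenizer.normalizer = NFD()
    # The PreTokenizer creates the initial word splits (on whitespace).
    tokenizer.pre_tokenizer = Whitespace()
    # The PostProcessor adds special tokens to the resulting Encoding.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )

    # Train on the placeholder corpus and encode a sample sentence.
    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]"])
    tokenizer.train(["corpus.txt"], trainer)
    encoding = tokenizer.encode("Hello, tokenizers!")
    print(encoding.tokens)  # includes the [CLS]/[SEP] tokens added by the post-processor

The printed tokens come back wrapped in the special tokens the post-processor declares, which is the behavior the %description refers to when it mentions adding "anything relevant that a language model would need".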