3 files changed, 671 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..f41b5fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/asrp-0.0.74.tar.gz
diff --git a/python-asrp.spec b/python-asrp.spec
new file mode 100644
index 0000000..2fda34b
--- /dev/null
+++ b/python-asrp.spec
@@ -0,0 +1,669 @@
+%global _empty_manifest_terminate_build 0
+Name:		python-asrp
+Version:	0.0.74
+Release:	1
+Summary:	Automatic Speech Recognition Preprocessing Utility
+License:	Apache
+URL:		https://github.com/voidful/asrp
+Source0:	https://mirrors.nju.edu.cn/pypi/web/packages/62/59/7929ee67d3e466c73931f22fef8640aa21332228b00ab8f237b2f0e004a2/asrp-0.0.74.tar.gz
+BuildArch:	noarch
+
+Requires:	python3-Unidecode
+Requires:	python3-jiwer
+Requires:	python3-transformers
+Requires:	python3-editdistance
+Requires:	python3-librosa
+Requires:	python3-webrtcvad
+Requires:	python3-pyctcdecode
+Requires:	python3-openai-whisper
+Requires:	python3-nlp2
+
+%description
+# ASRP: Automatic Speech Recognition Preprocessing Utility
+
+ASRP is a python package that offers a set of tools to preprocess and evaluate ASR (Automatic Speech Recognition) text.
+The package also provides a speech-to-text transcription tool and a text-to-speech conversion tool. The code is
+open-source and can be installed using pip.
+
+Key Features
+
+- [Preprocess ASR text with ease](#preprocess)
+- [Evaluate ASR output quality](#Evaluation)
+- [Transcribe speech to Hubert code](#speech-to-discrete-unit)
+- [Convert unit code to speech](#discrete-unit-to-speech)
+- [Enhance speech quality with a noise reduction tool](#speech-enhancement)
+- [LiveASR tool for real-time speech recognition](#liveasr---huggingfaces-model)
+- [Speaker Embedding Extraction (x-vector/d-vector)](#speaker-embedding-extraction---x-vector)
+
+## install
+
+`pip install asrp`
+
+## Preprocess
+
+ASRP offers an easy-to-use set of functions to preprocess ASR text data.   
+The input data is a dictionary with the key 'sentence', and the output is the preprocessed text.     
+You can either use the fun_en function or use dynamic loading. Here's how to use it:
+
+```python
+import asrp
+
+batch_data = {
+    'sentence': "I'm fine, thanks."
+}
+asrp.fun_en(batch_data)
+```
+
+dynamic loading
+
+```python
+import asrp
+
+batch_data = {
+    'sentence': "I'm fine, thanks."
+}
+preprocessor = getattr(asrp, 'fun_en')
+preprocessor(batch_data)
+```
+
+## Evaluation
+
+ASRP provides functions to evaluate the output quality of ASR systems using     
+the Word Error Rate (WER) and Character Error Rate (CER) metrics.   
+Here's how to use it:
+
+```python
+import asrp
+
+targets = ['HuggingFace is great!', 'Love Transformers!', 'Let\'s wav2vec!']
+preds = ['HuggingFace is awesome!', 'Transformers is powerful.', 'Let\'s finetune wav2vec!']
+print("chunk size WER: {:2f}".format(100 * asrp.chunked_wer(targets, preds, chunk_size=None)))
+print("chunk size CER: {:2f}".format(100 * asrp.chunked_cer(targets, preds, chunk_size=None)))
+```
+
+## Speech to Discrete Unit
+
+```python
+import asrp
+import nlp2
+
+# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md
+# https://github.com/facebookresearch/fairseq/tree/main/examples/textless_nlp/gslm/ulm
+nlp2.download_file(
+    'https://huggingface.co/voidful/mhubert-base/resolve/main/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', './')
+hc = asrp.HubertCode("voidful/mhubert-base", './mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', 11,
+                     chunk_sec=30,
+                     worker=20)
+hc('voice file path')
+```
+
+## Discrete Unit to speech
+
+```python
+import asrp
+
+code = []  # discrete unit
+# https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech
+# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md
+cs = asrp.Code2Speech(tts_checkpoint='./tts_checkpoint_best.pt', waveglow_checkpint='waveglow_256channels_new.pt')
+cs(code)
+
+# play on notebook
+import IPython.display as ipd
+
+ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate)
+```
+
+mhubert English hifigan vocoder example
+
+```python
+import asrp
+import nlp2
+import IPython.display as ipd
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+nlp2.download_file(
+    'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000',
+    './')
+
+
+tokenizer = AutoTokenizer.from_pretrained("voidful/mhubert-unit-tts")
+model = AutoModelForSeq2SeqLM.from_pretrained("voidful/mhubert-unit-tts")
+model.eval()
+cs = asrp.Code2Speech(tts_checkpoint='./g_00500000', vocoder='hifigan')
+
+inputs = tokenizer(["The quick brown fox jumps over the lazy dog."], return_tensors="pt")
+code = tokenizer.batch_decode(model.generate(**inputs,max_length=1024))[0]
+code = [int(i) for i in code.replace("</s>","").replace("<s>","").split("v_tok_")[1:]]
+print(code)
+ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate)
+
+```
+
+## Speech Enhancement
+
+ASRP also provides a noise-reduction tool for enhancing speech quality.  
+Source: https://github.com/facebookresearch/fairseq/tree/main/examples/speech_synthesis/preprocessing/denoiser
+
+```python
+from asrp import SpeechEnhancer
+
+ase = SpeechEnhancer()
+print(ase('./test/xxx.wav'))
+```
+
+## LiveASR - huggingface's model
+
+* Modified from https://github.com/oliverguhr/wav2vec2-live
+
+```python
+from asrp.live import LiveSpeech
+
+english_model = "voidful/wav2vec2-xlsr-multilingual-56"
+asr = LiveSpeech(english_model, device_name="default")
+asr.start()
+
+try:
+    while True:
+        text, sample_length, inference_time = asr.get_last_text()
+        print(f"{sample_length:.3f}s"
+              + f"\t{inference_time:.3f}s"
+              + f"\t{text}")
+
+except KeyboardInterrupt:
+    asr.stop()
+```
+
+## LiveASR - whisper's model
+
+```python
+from asrp.live import LiveSpeech
+
+whisper_model = "tiny"
+asr = LiveSpeech(whisper_model, vad_mode=2, language='zh')
+asr.start()
+last_text = ""
+while True:
+    asr_text = ""
+    try:
+        asr_text, sample_length, inference_time = asr.get_last_text()
+        if len(asr_text) > 0:
+            print(asr_text, sample_length, inference_time)
+    except KeyboardInterrupt:
+        asr.stop()
+        break
+
+```
+
+## Speaker Embedding Extraction - x vector
+
+from https://speechbrain.readthedocs.io/en/latest/API/speechbrain.lobes.models.Xvector.html
+
+```python
+from asrp.speaker_embedding import extract_x_vector
+
+extract_x_vector('./test/xxx.wav')
+```
+
+## Speaker Embedding Extraction - d vector
+
+from https://github.com/yistLin/dvector
+
+```python
+from asrp.speaker_embedding import extract_d_vector
+
+extract_d_vector('./test/xxx.wav')
+```
+
+
+
+
+%package -n python3-asrp
+Summary:	Automatic Speech Recognition Preprocessing Utility
+Provides:	python-asrp = %{version}-%{release}
+BuildRequires:	python3-devel
+BuildRequires:	python3-setuptools
+BuildRequires:	python3-pip
+%description -n python3-asrp
+# ASRP: Automatic Speech Recognition Preprocessing Utility
+
+ASRP is a python package that offers a set of tools to preprocess and evaluate ASR (Automatic Speech Recognition) text.
+The package also provides a speech-to-text transcription tool and a text-to-speech conversion tool. The code is
+open-source and can be installed using pip.
+
+Key Features
+
+- [Preprocess ASR text with ease](#preprocess)
+- [Evaluate ASR output quality](#Evaluation)
+- [Transcribe speech to Hubert code](#speech-to-discrete-unit)
+- [Convert unit code to speech](#discrete-unit-to-speech)
+- [Enhance speech quality with a noise reduction tool](#speech-enhancement)
+- [LiveASR tool for real-time speech recognition](#liveasr---huggingfaces-model)
+- [Speaker Embedding Extraction (x-vector/d-vector)](#speaker-embedding-extraction---x-vector)
+
+## install
+
+`pip install asrp`
+
+## Preprocess
+
+ASRP offers an easy-to-use set of functions to preprocess ASR text data.   
+The input data is a dictionary with the key 'sentence', and the output is the preprocessed text.     
+You can either use the fun_en function or use dynamic loading. Here's how to use it:
+
+```python
+import asrp
+
+batch_data = {
+    'sentence': "I'm fine, thanks."
+}
+asrp.fun_en(batch_data)
+```
+
+dynamic loading
+
+```python
+import asrp
+
+batch_data = {
+    'sentence': "I'm fine, thanks."
+}
+preprocessor = getattr(asrp, 'fun_en')
+preprocessor(batch_data)
+```
+
+## Evaluation
+
+ASRP provides functions to evaluate the output quality of ASR systems using     
+the Word Error Rate (WER) and Character Error Rate (CER) metrics.   
+Here's how to use it:
+
+```python
+import asrp
+
+targets = ['HuggingFace is great!', 'Love Transformers!', 'Let\'s wav2vec!']
+preds = ['HuggingFace is awesome!', 'Transformers is powerful.', 'Let\'s finetune wav2vec!']
+print("chunk size WER: {:2f}".format(100 * asrp.chunked_wer(targets, preds, chunk_size=None)))
+print("chunk size CER: {:2f}".format(100 * asrp.chunked_cer(targets, preds, chunk_size=None)))
+```
+
+## Speech to Discrete Unit
+
+```python
+import asrp
+import nlp2
+
+# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md
+# https://github.com/facebookresearch/fairseq/tree/main/examples/textless_nlp/gslm/ulm
+nlp2.download_file(
+    'https://huggingface.co/voidful/mhubert-base/resolve/main/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', './')
+hc = asrp.HubertCode("voidful/mhubert-base", './mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', 11,
+                     chunk_sec=30,
+                     worker=20)
+hc('voice file path')
+```
+
+## Discrete Unit to speech
+
+```python
+import asrp
+
+code = []  # discrete unit
+# https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech
+# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md
+cs = asrp.Code2Speech(tts_checkpoint='./tts_checkpoint_best.pt', waveglow_checkpint='waveglow_256channels_new.pt')
+cs(code)
+
+# play on notebook
+import IPython.display as ipd
+
+ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate)
+```
+
+mhubert English hifigan vocoder example
+
+```python
+import asrp
+import nlp2
+import IPython.display as ipd
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+nlp2.download_file(
+    'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000',
+    './')
+
+
+tokenizer = AutoTokenizer.from_pretrained("voidful/mhubert-unit-tts")
+model = AutoModelForSeq2SeqLM.from_pretrained("voidful/mhubert-unit-tts")
+model.eval()
+cs = asrp.Code2Speech(tts_checkpoint='./g_00500000', vocoder='hifigan')
+
+inputs = tokenizer(["The quick brown fox jumps over the lazy dog."], return_tensors="pt")
+code = tokenizer.batch_decode(model.generate(**inputs,max_length=1024))[0]
+code = [int(i) for i in code.replace("</s>","").replace("<s>","").split("v_tok_")[1:]]
+print(code)
+ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate)
+
+```
+
+## Speech Enhancement
+
+ASRP also provides a noise-reduction tool for enhancing speech quality.  
+Source: https://github.com/facebookresearch/fairseq/tree/main/examples/speech_synthesis/preprocessing/denoiser
+
+```python
+from asrp import SpeechEnhancer
+
+ase = SpeechEnhancer()
+print(ase('./test/xxx.wav'))
+```
+
+## LiveASR - huggingface's model
+
+* Modified from https://github.com/oliverguhr/wav2vec2-live
+
+```python
+from asrp.live import LiveSpeech
+
+english_model = "voidful/wav2vec2-xlsr-multilingual-56"
+asr = LiveSpeech(english_model, device_name="default")
+asr.start()
+
+try:
+    while True:
+        text, sample_length, inference_time = asr.get_last_text()
+        print(f"{sample_length:.3f}s"
+              + f"\t{inference_time:.3f}s"
+              + f"\t{text}")
+
+except KeyboardInterrupt:
+    asr.stop()
+```
+
+## LiveASR - whisper's model
+
+```python
+from asrp.live import LiveSpeech
+
+whisper_model = "tiny"
+asr = LiveSpeech(whisper_model, vad_mode=2, language='zh')
+asr.start()
+last_text = ""
+while True:
+    asr_text = ""
+    try:
+        asr_text, sample_length, inference_time = asr.get_last_text()
+        if len(asr_text) > 0:
+            print(asr_text, sample_length, inference_time)
+    except KeyboardInterrupt:
+        asr.stop()
+        break
+
+```
+
+## Speaker Embedding Extraction - x vector
+
+from https://speechbrain.readthedocs.io/en/latest/API/speechbrain.lobes.models.Xvector.html
+
+```python
+from asrp.speaker_embedding import extract_x_vector
+
+extract_x_vector('./test/xxx.wav')
+```
+
+## Speaker Embedding Extraction - d vector
+
+from https://github.com/yistLin/dvector
+
+```python
+from asrp.speaker_embedding import extract_d_vector
+
+extract_d_vector('./test/xxx.wav')
+```
+
+
+
+
+%package help
+Summary:	Development documents and examples for asrp
+Provides:	python3-asrp-doc = %{version}-%{release}
+%description help
+# ASRP: Automatic Speech Recognition Preprocessing Utility
+
+ASRP is a python package that offers a set of tools to preprocess and evaluate ASR (Automatic Speech Recognition) text.
+The package also provides a speech-to-text transcription tool and a text-to-speech conversion tool. The code is
+open-source and can be installed using pip.
+
+Key Features
+
+- [Preprocess ASR text with ease](#preprocess)
+- [Evaluate ASR output quality](#Evaluation)
+- [Transcribe speech to Hubert code](#speech-to-discrete-unit)
+- [Convert unit code to speech](#discrete-unit-to-speech)
+- [Enhance speech quality with a noise reduction tool](#speech-enhancement)
+- [LiveASR tool for real-time speech recognition](#liveasr---huggingfaces-model)
+- [Speaker Embedding Extraction (x-vector/d-vector)](#speaker-embedding-extraction---x-vector)
+
+## install
+
+`pip install asrp`
+
+## Preprocess
+
+ASRP offers an easy-to-use set of functions to preprocess ASR text data.   
+The input data is a dictionary with the key 'sentence', and the output is the preprocessed text.     
+You can either use the fun_en function or use dynamic loading. Here's how to use it:
+
+```python
+import asrp
+
+batch_data = {
+    'sentence': "I'm fine, thanks."
+}
+asrp.fun_en(batch_data)
+```
+
+dynamic loading
+
+```python
+import asrp
+
+batch_data = {
+    'sentence': "I'm fine, thanks."
+}
+preprocessor = getattr(asrp, 'fun_en')
+preprocessor(batch_data)
+```
+
+## Evaluation
+
+ASRP provides functions to evaluate the output quality of ASR systems using     
+the Word Error Rate (WER) and Character Error Rate (CER) metrics.   
+Here's how to use it:
+
+```python
+import asrp
+
+targets = ['HuggingFace is great!', 'Love Transformers!', 'Let\'s wav2vec!']
+preds = ['HuggingFace is awesome!', 'Transformers is powerful.', 'Let\'s finetune wav2vec!']
+print("chunk size WER: {:2f}".format(100 * asrp.chunked_wer(targets, preds, chunk_size=None)))
+print("chunk size CER: {:2f}".format(100 * asrp.chunked_cer(targets, preds, chunk_size=None)))
+```
+
+## Speech to Discrete Unit
+
+```python
+import asrp
+import nlp2
+
+# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md
+# https://github.com/facebookresearch/fairseq/tree/main/examples/textless_nlp/gslm/ulm
+nlp2.download_file(
+    'https://huggingface.co/voidful/mhubert-base/resolve/main/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', './')
+hc = asrp.HubertCode("voidful/mhubert-base", './mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', 11,
+                     chunk_sec=30,
+                     worker=20)
+hc('voice file path')
+```
+
+## Discrete Unit to speech
+
+```python
+import asrp
+
+code = []  # discrete unit
+# https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech
+# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md
+cs = asrp.Code2Speech(tts_checkpoint='./tts_checkpoint_best.pt', waveglow_checkpint='waveglow_256channels_new.pt')
+cs(code)
+
+# play on notebook
+import IPython.display as ipd
+
+ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate)
+```
+
+mhubert English hifigan vocoder example
+
+```python
+import asrp
+import nlp2
+import IPython.display as ipd
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+nlp2.download_file(
+    'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000',
+    './')
+
+
+tokenizer = AutoTokenizer.from_pretrained("voidful/mhubert-unit-tts")
+model = AutoModelForSeq2SeqLM.from_pretrained("voidful/mhubert-unit-tts")
+model.eval()
+cs = asrp.Code2Speech(tts_checkpoint='./g_00500000', vocoder='hifigan')
+
+inputs = tokenizer(["The quick brown fox jumps over the lazy dog."], return_tensors="pt")
+code = tokenizer.batch_decode(model.generate(**inputs,max_length=1024))[0]
+code = [int(i) for i in code.replace("</s>","").replace("<s>","").split("v_tok_")[1:]]
+print(code)
+ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate)
+
+```
+
+## Speech Enhancement
+
+ASRP also provides a noise-reduction tool for enhancing speech quality.  
+Source: https://github.com/facebookresearch/fairseq/tree/main/examples/speech_synthesis/preprocessing/denoiser
+
+```python
+from asrp import SpeechEnhancer
+
+ase = SpeechEnhancer()
+print(ase('./test/xxx.wav'))
+```
+
+## LiveASR - huggingface's model
+
+* Modified from https://github.com/oliverguhr/wav2vec2-live
+
+```python
+from asrp.live import LiveSpeech
+
+english_model = "voidful/wav2vec2-xlsr-multilingual-56"
+asr = LiveSpeech(english_model, device_name="default")
+asr.start()
+
+try:
+    while True:
+        text, sample_length, inference_time = asr.get_last_text()
+        print(f"{sample_length:.3f}s"
+              + f"\t{inference_time:.3f}s"
+              + f"\t{text}")
+
+except KeyboardInterrupt:
+    asr.stop()
+```
+
+## LiveASR - whisper's model
+
+```python
+from asrp.live import LiveSpeech
+
+whisper_model = "tiny"
+asr = LiveSpeech(whisper_model, vad_mode=2, language='zh')
+asr.start()
+last_text = ""
+while True:
+    asr_text = ""
+    try:
+        asr_text, sample_length, inference_time = asr.get_last_text()
+        if len(asr_text) > 0:
+            print(asr_text, sample_length, inference_time)
+    except KeyboardInterrupt:
+        asr.stop()
+        break
+
+```
+
+## Speaker Embedding Extraction - x vector
+
+from https://speechbrain.readthedocs.io/en/latest/API/speechbrain.lobes.models.Xvector.html
+
+```python
+from asrp.speaker_embedding import extract_x_vector
+
+extract_x_vector('./test/xxx.wav')
+```
+
+## Speaker Embedding Extraction - d vector
+
+from https://github.com/yistLin/dvector
+
+```python
+from asrp.speaker_embedding import extract_d_vector
+
+extract_d_vector('./test/xxx.wav')
+```
+
+
+
+
+%prep
+%autosetup -n asrp-%{version}
+
+%build
+%py3_build
+
+%install
+%py3_install
+# Ship upstream documentation and example trees, when present, in the doc dir.
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+for d in doc docs example examples; do
+	if [ -d "$d" ]; then cp -arf "$d" %{buildroot}/%{_pkgdocdir}; fi
+done
+# Build the manifests consumed by the files sections below. Every path is
+# wrapped in double quotes so that file names containing spaces stay a single
+# manifest entry instead of being split into several bogus ones.
+pushd %{buildroot}
+touch filelist.lst doclist.lst
+for d in usr/lib usr/lib64 usr/bin usr/sbin; do
+	if [ -d "$d" ]; then
+		find "$d" -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+	fi
+done
+# Man pages are listed with a .gz suffix because they are expected to be
+# compressed after this scriptlet finishes (NOTE: confirm for this distro).
+if [ -d usr/share/man ]; then
+	find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-asrp -f filelist.lst
+# Own the top-level entries under site-packages as directories.
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+# Everything installed beneath the shared documentation root.
+%{_docdir}/*
+
+%changelog
+* Wed May 17 2023 Python_Bot <Python_Bot@openeuler.org> - 0.0.74-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..031d8e4
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+e26c984561f22394c352db77ee2b604d  asrp-0.0.74.tar.gz