diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | python-asrp.spec | 669 | ||||
-rw-r--r-- | sources | 1 |
3 files changed, 671 insertions, 0 deletions
@@ -0,0 +1 @@ +/asrp-0.0.74.tar.gz diff --git a/python-asrp.spec b/python-asrp.spec new file mode 100644 index 0000000..2fda34b --- /dev/null +++ b/python-asrp.spec @@ -0,0 +1,669 @@ +%global _empty_manifest_terminate_build 0 +Name: python-asrp +Version: 0.0.74 +Release: 1 +Summary: please add a summary manually as the author left a blank one +License: Apache +URL: https://github.com/voidful/asrp +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/62/59/7929ee67d3e466c73931f22fef8640aa21332228b00ab8f237b2f0e004a2/asrp-0.0.74.tar.gz +BuildArch: noarch + +Requires: python3-Unidecode +Requires: python3-jiwer +Requires: python3-transformers +Requires: python3-editdistance +Requires: python3-librosa +Requires: python3-webrtcvad +Requires: python3-pyctcdecode +Requires: python3-openai-whisper +Requires: python3-nlp2 + +%description +# ASRP: Automatic Speech Recognition Preprocessing Utility + +ASRP is a python package that offers a set of tools to preprocess and evaluate ASR (Automatic Speech Recognition) text. +The package also provides a speech-to-text transcription tool and a text-to-speech conversion tool. The code is +open-source and can be installed using pip. + +Key Features + +- [Preprocess ASR text with ease](#preprocess) +- [Evaluate ASR output quality](#Evaluation) +- [Transcribe speech to Hubert code](#speech-to-discrete-unit) +- [Convert unit code to speech](#discrete-unit-to-speech) +- [Enhance speech quality with a noise reduction tool](#speech-enhancement) +- [LiveASR tool for real-time speech recognition](#liveasr---huggingfaces-model) +- [Speaker Embedding Extraction (x-vector/d-vector)](#speaker-embedding-extraction---x-vector) + +## install + +`pip install asrp` + +## Preprocess + +ASRP offers an easy-to-use set of functions to preprocess ASR text data. +The input data is a dictionary with the key 'sentence', and the output is the preprocessed text. +You can either use the fun_en function or use dynamic loading. 
Here's how to use it: + +```python +import asrp + +batch_data = { + 'sentence': "I'm fine, thanks." +} +asrp.fun_en(batch_data) +``` + +dynamic loading + +```python +import asrp + +batch_data = { + 'sentence': "I'm fine, thanks." +} +preprocessor = getattr(asrp, 'fun_en') +preprocessor(batch_data) +``` + +## Evaluation + +ASRP provides functions to evaluate the output quality of ASR systems using +the Word Error Rate (WER) and Character Error Rate (CER) metrics. +Here's how to use it: + +```python +import asrp + +targets = ['HuggingFace is great!', 'Love Transformers!', 'Let\'s wav2vec!'] +preds = ['HuggingFace is awesome!', 'Transformers is powerful.', 'Let\'s finetune wav2vec!'] +print("chunk size WER: {:2f}".format(100 * asrp.chunked_wer(targets, preds, chunk_size=None))) +print("chunk size CER: {:2f}".format(100 * asrp.chunked_cer(targets, preds, chunk_size=None))) +``` + +## Speech to Discrete Unit + +```python +import asrp +import nlp2 + +# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md +# https://github.com/facebookresearch/fairseq/tree/main/examples/textless_nlp/gslm/ulm +nlp2.download_file( + 'https://huggingface.co/voidful/mhubert-base/resolve/main/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', './') +hc = asrp.HubertCode("voidful/mhubert-base", './mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', 11, + chunk_sec=30, + worker=20) +hc('voice file path') +``` + +## Discrete Unit to speech + +```python +import asrp + +code = [] # discrete unit +# https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech +# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md +cs = asrp.Code2Speech(tts_checkpoint='./tts_checkpoint_best.pt', waveglow_checkpint='waveglow_256channels_new.pt') +cs(code) + +# play on notebook +import IPython.display as ipd + +ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate) +``` + +mhubert 
English hifigan vocoder example + +```python +import asrp +import nlp2 +import IPython.display as ipd +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +nlp2.download_file( + 'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000', + './') + + +tokenizer = AutoTokenizer.from_pretrained("voidful/mhubert-unit-tts") +model = AutoModelForSeq2SeqLM.from_pretrained("voidful/mhubert-unit-tts") +model.eval() +cs = asrp.Code2Speech(tts_checkpoint='./g_00500000', vocoder='hifigan') + +inputs = tokenizer(["The quick brown fox jumps over the lazy dog."], return_tensors="pt") +code = tokenizer.batch_decode(model.generate(**inputs,max_length=1024))[0] +code = [int(i) for i in code.replace("</s>","").replace("<s>","").split("v_tok_")[1:]] +print(code) +ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate) + +``` + +## Speech Enhancement + +ASRP also provides a noise-reduction tool for enhancing speech quality. 
+from https://github.com/facebookresearch/fairseq/tree/main/examples/speech_synthesis/preprocessing/denoiser + +```python +from asrp import SpeechEnhancer + +ase = SpeechEnhancer() +print(ase('./test/xxx.wav')) +``` + +## LiveASR - huggingface's model + +* modified from https://github.com/oliverguhr/wav2vec2-live + +```python +from asrp.live import LiveSpeech + +english_model = "voidful/wav2vec2-xlsr-multilingual-56" +asr = LiveSpeech(english_model, device_name="default") +asr.start() + +try: + while True: + text, sample_length, inference_time = asr.get_last_text() + print(f"{sample_length:.3f}s" + + f"\t{inference_time:.3f}s" + + f"\t{text}") + +except KeyboardInterrupt: + asr.stop() +``` + +## LiveASR - whisper's model + +```python +from asrp.live import LiveSpeech + +whisper_model = "tiny" +asr = LiveSpeech(whisper_model, vad_mode=2, language='zh') +asr.start() +last_text = "" +while True: + asr_text = "" + try: + asr_text, sample_length, inference_time = asr.get_last_text() + if len(asr_text) > 0: + print(asr_text, sample_length, inference_time) + except KeyboardInterrupt: + asr.stop() + break + +``` + +## Speaker Embedding Extraction - x vector + +from https://speechbrain.readthedocs.io/en/latest/API/speechbrain.lobes.models.Xvector.html + +```python +from asrp.speaker_embedding import extract_x_vector + +extract_x_vector('./test/xxx.wav') +``` + +## Speaker Embedding Extraction - d vector + +from https://github.com/yistLin/dvector + +```python +from asrp.speaker_embedding import extract_d_vector + +extract_d_vector('./test/xxx.wav') +``` + + + + +%package -n python3-asrp +Summary: Automatic Speech Recognition Preprocessing Utility +Provides: python-asrp +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-asrp +# ASRP: Automatic Speech Recognition Preprocessing Utility + +ASRP is a Python package that offers a set of tools to preprocess and evaluate ASR (Automatic Speech Recognition) 
text. +The package also provides a speech-to-text transcription tool and a text-to-speech conversion tool. The code is +open-source and can be installed using pip. + +Key Features + +- [Preprocess ASR text with ease](#preprocess) +- [Evaluate ASR output quality](#Evaluation) +- [Transcribe speech to Hubert code](#speech-to-discrete-unit) +- [Convert unit code to speech](#discrete-unit-to-speech) +- [Enhance speech quality with a noise reduction tool](#speech-enhancement) +- [LiveASR tool for real-time speech recognition](#liveasr---huggingfaces-model) +- [Speaker Embedding Extraction (x-vector/d-vector)](#speaker-embedding-extraction---x-vector) + +## install + +`pip install asrp` + +## Preprocess + +ASRP offers an easy-to-use set of functions to preprocess ASR text data. +The input data is a dictionary with the key 'sentence', and the output is the preprocessed text. +You can either use the fun_en function or use dynamic loading. Here's how to use it: + +```python +import asrp + +batch_data = { + 'sentence': "I'm fine, thanks." +} +asrp.fun_en(batch_data) +``` + +dynamic loading + +```python +import asrp + +batch_data = { + 'sentence': "I'm fine, thanks." +} +preprocessor = getattr(asrp, 'fun_en') +preprocessor(batch_data) +``` + +## Evaluation + +ASRP provides functions to evaluate the output quality of ASR systems using +the Word Error Rate (WER) and Character Error Rate (CER) metrics. 
+Here's how to use it: + +```python +import asrp + +targets = ['HuggingFace is great!', 'Love Transformers!', 'Let\'s wav2vec!'] +preds = ['HuggingFace is awesome!', 'Transformers is powerful.', 'Let\'s finetune wav2vec!'] +print("chunk size WER: {:2f}".format(100 * asrp.chunked_wer(targets, preds, chunk_size=None))) +print("chunk size CER: {:2f}".format(100 * asrp.chunked_cer(targets, preds, chunk_size=None))) +``` + +## Speech to Discrete Unit + +```python +import asrp +import nlp2 + +# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md +# https://github.com/facebookresearch/fairseq/tree/main/examples/textless_nlp/gslm/ulm +nlp2.download_file( + 'https://huggingface.co/voidful/mhubert-base/resolve/main/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', './') +hc = asrp.HubertCode("voidful/mhubert-base", './mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', 11, + chunk_sec=30, + worker=20) +hc('voice file path') +``` + +## Discrete Unit to speech + +```python +import asrp + +code = [] # discrete unit +# https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech +# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md +cs = asrp.Code2Speech(tts_checkpoint='./tts_checkpoint_best.pt', waveglow_checkpint='waveglow_256channels_new.pt') +cs(code) + +# play on notebook +import IPython.display as ipd + +ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate) +``` + +mhubert English hifigan vocoder example + +```python +import asrp +import nlp2 +import IPython.display as ipd +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +nlp2.download_file( + 'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000', + './') + + +tokenizer = AutoTokenizer.from_pretrained("voidful/mhubert-unit-tts") +model = 
AutoModelForSeq2SeqLM.from_pretrained("voidful/mhubert-unit-tts") +model.eval() +cs = asrp.Code2Speech(tts_checkpoint='./g_00500000', vocoder='hifigan') + +inputs = tokenizer(["The quick brown fox jumps over the lazy dog."], return_tensors="pt") +code = tokenizer.batch_decode(model.generate(**inputs,max_length=1024))[0] +code = [int(i) for i in code.replace("</s>","").replace("<s>","").split("v_tok_")[1:]] +print(code) +ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate) + +``` + +## Speech Enhancement + +ASRP also provides a tool to enhance speech quality with a noise reduction tool. +from https://github.com/facebookresearch/fairseq/tree/main/examples/speech_synthesis/preprocessing/denoiser + +```python +from asrp import SpeechEnhancer + +ase = SpeechEnhancer() +print(ase('./test/xxx.wav')) +``` + +## LiveASR - huggingface's model + +* modify from https://github.com/oliverguhr/wav2vec2-live + +```python +from asrp.live import LiveSpeech + +english_model = "voidful/wav2vec2-xlsr-multilingual-56" +asr = LiveSpeech(english_model, device_name="default") +asr.start() + +try: + while True: + text, sample_length, inference_time = asr.get_last_text() + print(f"{sample_length:.3f}s" + + f"\t{inference_time:.3f}s" + + f"\t{text}") + +except KeyboardInterrupt: + asr.stop() +``` + +## LiveASR - whisper's model + +```python +from asrp.live import LiveSpeech + +whisper_model = "tiny" +asr = LiveSpeech(whisper_model, vad_mode=2, language='zh') +asr.start() +last_text = "" +while True: + asr_text = "" + try: + asr_text, sample_length, inference_time = asr.get_last_text() + if len(asr_text) > 0: + print(asr_text, sample_length, inference_time) + except KeyboardInterrupt: + asr.stop() + break + +``` + +## Speaker Embedding Extraction - x vector + +from https://speechbrain.readthedocs.io/en/latest/API/speechbrain.lobes.models.Xvector.html + +```python +from asrp.speaker_embedding import extract_x_vector + +extract_x_vector('./test/xxx.wav') +``` + +## Speaker Embedding 
Extraction - d vector + +from https://github.com/yistLin/dvector + +```python +from asrp.speaker_embedding import extract_d_vector + +extract_d_vector('./test/xxx.wav') +``` + + + + +%package help +Summary: Development documents and examples for asrp +Provides: python3-asrp-doc +%description help +# ASRP: Automatic Speech Recognition Preprocessing Utility + +ASRP is a python package that offers a set of tools to preprocess and evaluate ASR (Automatic Speech Recognition) text. +The package also provides a speech-to-text transcription tool and a text-to-speech conversion tool. The code is +open-source and can be installed using pip. + +Key Features + +- [Preprocess ASR text with ease](#preprocess) +- [Evaluate ASR output quality](#Evaluation) +- [Transcribe speech to Hubert code](#speech-to-discrete-unit) +- [Convert unit code to speech](#discrete-unit-to-speech) +- [Enhance speech quality with a noise reduction tool](#speech-enhancement) +- [LiveASR tool for real-time speech recognition](#liveasr---huggingfaces-model) +- [Speaker Embedding Extraction (x-vector/d-vector)](#speaker-embedding-extraction---x-vector) + +## install + +`pip install asrp` + +## Preprocess + +ASRP offers an easy-to-use set of functions to preprocess ASR text data. +The input data is a dictionary with the key 'sentence', and the output is the preprocessed text. +You can either use the fun_en function or use dynamic loading. Here's how to use it: + +```python +import asrp + +batch_data = { + 'sentence': "I'm fine, thanks." +} +asrp.fun_en(batch_data) +``` + +dynamic loading + +```python +import asrp + +batch_data = { + 'sentence': "I'm fine, thanks." +} +preprocessor = getattr(asrp, 'fun_en') +preprocessor(batch_data) +``` + +## Evaluation + +ASRP provides functions to evaluate the output quality of ASR systems using +the Word Error Rate (WER) and Character Error Rate (CER) metrics. 
+Here's how to use it: + +```python +import asrp + +targets = ['HuggingFace is great!', 'Love Transformers!', 'Let\'s wav2vec!'] +preds = ['HuggingFace is awesome!', 'Transformers is powerful.', 'Let\'s finetune wav2vec!'] +print("chunk size WER: {:2f}".format(100 * asrp.chunked_wer(targets, preds, chunk_size=None))) +print("chunk size CER: {:2f}".format(100 * asrp.chunked_cer(targets, preds, chunk_size=None))) +``` + +## Speech to Discrete Unit + +```python +import asrp +import nlp2 + +# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md +# https://github.com/facebookresearch/fairseq/tree/main/examples/textless_nlp/gslm/ulm +nlp2.download_file( + 'https://huggingface.co/voidful/mhubert-base/resolve/main/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', './') +hc = asrp.HubertCode("voidful/mhubert-base", './mhubert_base_vp_en_es_fr_it3_L11_km1000.bin', 11, + chunk_sec=30, + worker=20) +hc('voice file path') +``` + +## Discrete Unit to speech + +```python +import asrp + +code = [] # discrete unit +# https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech +# https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_to_speech/docs/textless_s2st_real_data.md +cs = asrp.Code2Speech(tts_checkpoint='./tts_checkpoint_best.pt', waveglow_checkpint='waveglow_256channels_new.pt') +cs(code) + +# play on notebook +import IPython.display as ipd + +ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate) +``` + +mhubert English hifigan vocoder example + +```python +import asrp +import nlp2 +import IPython.display as ipd +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +nlp2.download_file( + 'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000', + './') + + +tokenizer = AutoTokenizer.from_pretrained("voidful/mhubert-unit-tts") +model = 
AutoModelForSeq2SeqLM.from_pretrained("voidful/mhubert-unit-tts") +model.eval() +cs = asrp.Code2Speech(tts_checkpoint='./g_00500000', vocoder='hifigan') + +inputs = tokenizer(["The quick brown fox jumps over the lazy dog."], return_tensors="pt") +code = tokenizer.batch_decode(model.generate(**inputs,max_length=1024))[0] +code = [int(i) for i in code.replace("</s>","").replace("<s>","").split("v_tok_")[1:]] +print(code) +ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate) + +``` + +## Speech Enhancement + +ASRP also provides a tool to enhance speech quality with a noise reduction tool. +from https://github.com/facebookresearch/fairseq/tree/main/examples/speech_synthesis/preprocessing/denoiser + +```python +from asrp import SpeechEnhancer + +ase = SpeechEnhancer() +print(ase('./test/xxx.wav')) +``` + +## LiveASR - huggingface's model + +* modify from https://github.com/oliverguhr/wav2vec2-live + +```python +from asrp.live import LiveSpeech + +english_model = "voidful/wav2vec2-xlsr-multilingual-56" +asr = LiveSpeech(english_model, device_name="default") +asr.start() + +try: + while True: + text, sample_length, inference_time = asr.get_last_text() + print(f"{sample_length:.3f}s" + + f"\t{inference_time:.3f}s" + + f"\t{text}") + +except KeyboardInterrupt: + asr.stop() +``` + +## LiveASR - whisper's model + +```python +from asrp.live import LiveSpeech + +whisper_model = "tiny" +asr = LiveSpeech(whisper_model, vad_mode=2, language='zh') +asr.start() +last_text = "" +while True: + asr_text = "" + try: + asr_text, sample_length, inference_time = asr.get_last_text() + if len(asr_text) > 0: + print(asr_text, sample_length, inference_time) + except KeyboardInterrupt: + asr.stop() + break + +``` + +## Speaker Embedding Extraction - x vector + +from https://speechbrain.readthedocs.io/en/latest/API/speechbrain.lobes.models.Xvector.html + +```python +from asrp.speaker_embedding import extract_x_vector + +extract_x_vector('./test/xxx.wav') +``` + +## Speaker Embedding 
Extraction - d vector + +from https://github.com/yistLin/dvector + +```python +from asrp.speaker_embedding import extract_d_vector + +extract_d_vector('./test/xxx.wav') +``` + + + + +%prep +%autosetup -n asrp-0.0.74 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . + +%files -n python3-asrp -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Wed May 17 2023 Python_Bot <Python_Bot@openeuler.org> - 0.0.74-1 +- Package Spec generated @@ -0,0 +1 @@ +e26c984561f22394c352db77ee2b604d asrp-0.0.74.tar.gz |