From ea3fe7e41c75c309341ed0a282530063adf9bcd8 Mon Sep 17 00:00:00 2001
From: CoprDistGit
Date: Mon, 29 May 2023 10:55:43 +0000
Subject: automatic import of python-takeblipner
---
 .gitignore              |    1 +
 python-takeblipner.spec | 1115 +++++++++++++++++++++++++++++++++++++++++++++++
 sources                 |    1 +
 3 files changed, 1117 insertions(+)
 create mode 100644 python-takeblipner.spec
 create mode 100644 sources

diff --git a/.gitignore b/.gitignore
index e69de29..4363144 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/TakeBlipNer-0.0.6.tar.gz
diff --git a/python-takeblipner.spec b/python-takeblipner.spec
new file mode 100644
index 0000000..d5f4e1a
--- /dev/null
+++ b/python-takeblipner.spec
@@ -0,0 +1,1115 @@
+%global _empty_manifest_terminate_build 0
+Name: python-TakeBlipNer
+Version: 0.0.6
+Release: 1
+Summary: Named Entity Recognition Package
+License: MIT License
+URL: https://pypi.org/project/TakeBlipNer/
+Source0: https://mirrors.nju.edu.cn/pypi/web/packages/be/6b/72d964acb48f4bb01d994c493d5f15ae805c78f4fa8629580a6d2974344e/TakeBlipNer-0.0.6.tar.gz
+BuildArch: noarch
+
+Requires: python3-pyaap
+Requires: python3-tqdm
+Requires: python3-gensim
+Requires: python3-TakeSentenceTokenizer
+Requires: python3-tensorboard
+
+%description
+# TakeBlipNer Package
+_Data & Analytics Research_
+
+## Overview
+
+NER (Named Entity Recognition) is an NLP task that aims to locate and classify named entities in a text.
+This implementation solves the NER task with a BiLSTM-CRF model, using the PyTorch framework to train a supervised model and to predict on CPU.
+For training, it receives a pre-trained FastText Gensim embedding, a PosTagging model and a .csv file. It outputs three pickle files: the model, the word vocabulary and the label vocabulary.
+Examples of classes that can be trained:
+
+- Financial [FIN]
+- Generic [GEN]
+- Company [COMP]
+- Number [NUMBER]
+- Document [DOC]
+- Location [LOC]
+- Person [PERS]
+- Phone [PHONE]
+- Address [ADDR]
+- Email [EMAIL]
+- Date [DATE]
+- Week Day [WD]
+- Money [MONEY]
+- Relatives [REL]
+- Vocatives [VOC]
+
+Additional information is used to identify where each recognized entity begins and ends:
+
+- The letter B indicates the beginning of an entity of class CLASS;
+- The letter I indicates that the respective token is a continuation of the CLASS entity that was started;
+- The letter O indicates that no entity was found for the token.
+
+For example, the sentence "ligar internet a cabo!" would be classified as "O O B-GEN I-GEN I-GEN",
+where B-GEN represents the beginning of the GEN entity (token "internet") and the next two tags are
+the continuation of the entity (tokens "a cabo"). The entity found in the sentence is
+"internet a cabo" of the GEN class.
+
+The following sections cover model training and prediction.
+
+## Training NER Model
+To train your own NER model using this package, the following steps should
+be performed:
+1) Import the main packages;
+2) Initialize the file variables: embedding, PosTagging, and train and validation .csv files;
+3) Initialize the NER parameters;
+4) Instantiate the vocabulary and label vocabulary objects;
+5) Save the vocabulary and label models;
+6) Read the PosTagging pickle file and the embedding, and initialize the PosTagging object;
+7) Initialize the BiLSTM-CRF model and set its embeddings;
+8) Initialize the LSTMCRFTrainer object;
+9) Train the model.
+
+An example of the above steps can be found in the Python code below:
+
+1) Import the main packages:
+```
+import torch
+
+from TakeBlipNer import utils, vocab, nermodel
+from TakeBlipNer.train import LSTMCRFTrainer
+from TakeBlipPosTagger.predict import PosTaggerPredict
+```
+2) Initialize the file variables:
+```
+wordembed_path = '*.kv'
+save_dir = '*'
+input_path = '*.csv'
+val_path = '*.csv'
+postag_model_path = '*.pkl'
+postag_label_path = '*.pkl'
+```
+3) Initialize the NER parameters
+
+In order to train a model, the following variables should be created:
+
+- **sentence_column**: String with the sentence column name in the train file;
+- **label_column**: String with the label column name in the train file;
+- **unknown_string**: String which represents the unknown token;
+- **padding_string**: String which represents the pad token;
+- **batch_size**: Number of samples that will be propagated through the network;
+- **shuffle**: Boolean indicating whether the dataset is shuffled;
+- **use_pre_processing**: Boolean indicating whether the sentence will be preprocessed;
+- **separator**: String with the file separator (for batch prediction);
+- **encoding**: String with the encoding used in the sentence;
+- **save_dir**: String with the directory where outputs are saved (checkpoints, vocabs, etc.);
+- **device**: String with the device where training will occur (cpu or gpu);
+- **word_dim**: Integer with the dimension of the word embeddings;
+- **lstm_dim**: Integer with the dimension of the LSTM cells. This determines the hidden state and cell state sizes;
+- **lstm_layers**: Integer with the number of LSTM layers;
+- **dropout_prob**: Float with the probability used in the dropout layers;
+- **bidirectional**: Boolean indicating whether the LSTM cells are bidirectional;
+- **alpha**: Float with the L2 penalization parameter;
+- **epochs**: Integer with the number of training epochs;
+- **ckpt_period**: Period to wait until a model checkpoint is saved to disk. Periods are specified by an integer and a unit ("e": epoch, "i": iteration, "s": global step), as illustrated in the sketch after this list;
+- **val**: Boolean indicating whether to perform validation;
+- **val_period**: Period to wait until a validation is performed. Periods are specified by an integer and a unit ("e": epoch, "i": iteration, "s": global step);
+- **samples**: Integer with the number of output samples to display at each validation;
+- **learning_rate**: Float with the learning rate;
+- **learning_rate_decay**: Float with the learning rate decay;
+- **max_patience**: Integer with the max patience parameter;
+- **max_decay_num**: Integer with the max number of learning rate decays;
+- **patience_threshold**: Float with the loss threshold used for the patience count.
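+
+To make the period format concrete, here is a minimal sketch of how such a
+period string can be read. This is only an illustration of the convention; the
+package's own `utils.PeriodChecker` is the actual implementation and its
+internals may differ:
+```
+# Hypothetical parser for period strings such as '1e' or '100s'; it only
+# illustrates the "<integer><unit>" format expected by ckpt_period/val_period.
+def parse_period(period):
+    units = {'e': 'epoch', 'i': 'iteration', 's': 'global step'}
+    value, unit = int(period[:-1]), period[-1]
+    if unit not in units:
+        raise ValueError('unknown period unit: ' + unit)
+    return value, units[unit]
+
+print(parse_period('1e'))  # (1, 'epoch'): act once every epoch
+```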
+
+
+Example of parameters creation:
+```
+sentence_column = 'Message'
+label_column = 'Tags'
+unknown_string = '<unk>'
+padding_string = '<pad>'
+batch_size = 64
+shuffle = False
+use_pre_processing = True
+separator = '|'
+encoding = 'utf-8'
+device = 'cpu'
+word_dim = 300
+lstm_dim = 300
+lstm_layers = 1
+dropout_prob = 0.05
+bidirectional = False
+alpha = 0.5
+epochs = 10
+ckpt_period = '1e'
+val = True
+val_period = '1e'
+samples = 10
+learning_rate = 0.001
+learning_rate_decay = 0.01
+max_patience = 5
+max_decay_num = 5
+patience_threshold = 0.98
+```
+4) Instantiate the vocabulary and label vocabulary objects:
+```
+input_vocab = vocab.create_vocabulary(
+    input_path=input_path,
+    column_name=sentence_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    encoding=encoding,
+    separator=separator,
+    use_pre_processing=use_pre_processing)
+
+label_vocab = vocab.create_vocabulary(
+    input_path=input_path,
+    column_name=label_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    encoding=encoding,
+    separator=separator,
+    is_label=True)
+```
+
+5) Save the vocabulary and label models:
+```
+vocab.save_vocabs(save_dir, input_vocab, 'ner-vocab-input.pkl')
+vocab.save_vocabs(save_dir, label_vocab, 'ner-vocab-label.pkl')
+vocab.save_vocabs(save_dir, postag_label_path, 'vocab-postag.pkl')
+```
+
+6) Read the PosTagging pickle file and the embedding, and initialize the PosTagging object:
+```
+postag_bilstmcrf = torch.load(postag_model_path)
+embedding = utils.load_fasttext_embeddings(wordembed_path, padding_string)
+
+postag_model = PosTaggerPredict(
+    model=postag_bilstmcrf,
+    label_path=postag_label_path,
+    embedding=embedding)
+```
+
+7) Initialize the BiLSTM-CRF model and set its embeddings:
+```
+# The CRF layer scores tag-to-tag transitions between NER labels.
+crf = nermodel.CRF(
+    vocab_size=len(label_vocab),
+    pad_idx=input_vocab.f2i[padding_string],
+    unk_idx=input_vocab.f2i[unknown_string],
+    device=device).to(device)
+
+# The LSTM-CRF model combines word embeddings and PoS-tag embeddings as inputs.
+bilstmcr_model = nermodel.LSTMCRF_NER(
+    device=device,
+    crf=crf,
+    vocab_sizes=[len(input_vocab), len(postag_model.label_vocab)],
+    word_dims=[word_dim, len(postag_model.label_vocab)],
+    hidden_dim=lstm_dim,
+    layers=lstm_layers,
+    dropout_prob=dropout_prob,
+    bidirectional=bidirectional,
+    alpha=alpha
+).to(device)
+
+bilstmcr_model.reset_parameters()
+
+fasttext = postag_model.fasttext
+
+# Load the pre-trained FastText vectors into the word embedding and freeze it.
+bilstmcr_model.embeddings[0].weight.data = torch.from_numpy(fasttext[input_vocab.i2f.values()])
+bilstmcr_model.embeddings[0].weight.requires_grad = False
+
+# Load the PoS-tag representation into the second embedding and freeze it too.
+utils.load_postag_representation(bilstmcr_model.embeddings[1], postag_model.label_vocab)
+bilstmcr_model.embeddings[1].weight.requires_grad = False
+```
+8) Initialize the LSTMCRFTrainer object:
+```
+trainer = LSTMCRFTrainer(
+    bilstmcrf_model=bilstmcr_model,
+    epochs=epochs,
+    input_vocab=input_vocab,
+    input_path=input_path,
+    postag_model=postag_model,
+    postag_label_vocab=postag_model.label_vocab,
+    label_vocab=label_vocab,
+    save_dir=save_dir,
+    ckpt_period=utils.PeriodChecker(ckpt_period),
+    val=val,
+    val_period=utils.PeriodChecker(val_period),
+    samples=samples,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    batch_size=batch_size,
+    shuffle=shuffle,
+    label_column=label_column,
+    encoding=encoding,
+    separator=separator,
+    use_pre_processing=use_pre_processing,
+    learning_rate=learning_rate,
+    learning_rate_decay=learning_rate_decay,
+    max_patience=max_patience,
+    max_decay_num=max_decay_num,
+    patience_threshold=patience_threshold,
+    val_path=val_path,
+    tensorboard=None)
+```
+9) Train the model:
+```
+trainer.train()
+```
+
+## Prediction
+
+The prediction can be done in two ways: with a single sentence or
+with a batch of sentences.
+
+### Single Prediction
+
+To predict a single sentence, the method **predict_line** should be used.
+
+**Important**: before labeling a sentence, a few steps are needed:
+1) Import the main packages;
+2) Initialize the model variables;
+3) Read the NER model and the embedding model;
+4) Initialize the predictor and use it.
+
+An example of the above steps can be found in the Python code below:
+
+1) Import the main packages:
+```
+import torch
+
+import TakeBlipNer.utils as utils
+from TakeBlipNer.predict import NerPredict
+```
+2) Initialize the model variables:
+
+In order to predict the sentence tags, the following variables should be
+created:
+- **postag_model_path**: string with the path of the PosTagging pickle model;
+- **postag_label_path**: string with the path of the PosTagging pickle labels;
+- **ner_model_path**: string with the path of the NER pickle model;
+- **ner_label_path**: string with the path of the NER pickle labels;
+- **wordembed_path**: string with the path of the FastText embedding file;
+- **save_dir**: string with the path and file name which will be used to
+  save the predicted sentences (for batch prediction);
+- **padding_string**: string which represents the pad token;
+- **encoding**: string with the encoding used in the sentence;
+- **separator**: string with the file separator (for batch prediction);
+- **sentence**: string with the sentence to be labeled.
+
+Example of variables creation:
+```
+postag_model_path = '*.pkl'
+postag_label_path = '*.pkl'
+ner_label_path = '*.pkl'
+ner_model_path = '*.pkl'
+wordembed_path = '*.kv'
+save_dir = '*.csv'
+padding_string = '<pad>'
+encoding = 'utf-8'
+separator = '|'
+sentence = 'SENTENCE EXAMPLE TO PREDICT'
+```
+3) Read the NER model and the embedding model:
+```
+bilstmcrf = torch.load(ner_model_path)
+embedding = utils.load_fasttext_embeddings(wordembed_path, padding_string)
+```
+4) Initialize the predictor and use it:
+```
+ner_predicter = NerPredict(
+    model=bilstmcrf,
+    label_path=ner_label_path,
+    embedding=embedding,
+    save_dir=save_dir,
+    encoding=encoding,
+    separator=separator)
+
+print(ner_predicter.predict_line(sentence))
+```
+
+### Batch Prediction
+
+To predict a batch of sentences in a .csv file, another set of variables should
+be created and passed to the **predict_batch** method. The variables are the following:
+- **input_path**: string with the path of the .csv file;
+- **sentence_column**: string with the sentence column name of the .csv file;
+- **unknown_string**: string which represents the unknown token;
+- **batch_size**: number of sentences which will be predicted at the same time;
+- **shuffle**: boolean indicating whether the dataset is shuffled;
+- **use_pre_processing**: boolean indicating whether the sentence will be preprocessed;
+- **output_lstm**: boolean indicating whether the LSTM prediction will be saved.
+
+Example of initialization and usage of the **predict_batch** method:
+```
+input_path = '*.csv'
+sentence_column = '*'
+unknown_string = '<unk>'
+batch_size = 64
+shuffle = False
+use_pre_processing = True
+output_lstm = True
+
+ner_predicter.predict_batch(
+    filepath=input_path,
+    sentence_column=sentence_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    batch_size=batch_size,
+    shuffle=shuffle,
+    use_pre_processing=use_pre_processing,
+    output_lstm=output_lstm)
+```
+The batch sentence predictions will be saved in the given **save_dir** path.
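+
+As a follow-up, the saved predictions can be inspected with pandas. This is
+only a sketch under assumptions: it assumes the output is a delimiter-separated
+text file written to the `save_dir` path with the separator used above; the
+exact column layout depends on the package version, so check the file header:
+```
+import pandas as pd
+
+save_dir = '*.csv'  # same output path that was passed to NerPredict
+separator = '|'     # same separator that was passed to NerPredict
+
+# Load the written predictions and peek at the first rows.
+predictions = pd.read_csv(save_dir, sep=separator, encoding='utf-8')
+print(predictions.head())
+```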
+
+
+
+%package -n python3-TakeBlipNer
+Summary: Named Entity Recognition Package
+Provides: python-TakeBlipNer
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-TakeBlipNer
+# TakeBlipNer Package
+_Data & Analytics Research_
+
+## Overview
+
+NER (Named Entity Recognition) is an NLP task that aims to locate and classify named entities in a text.
+This implementation solves the NER task with a BiLSTM-CRF model, using the PyTorch framework to train a supervised model and to predict on CPU.
+For training, it receives a pre-trained FastText Gensim embedding, a PosTagging model and a .csv file. It outputs three pickle files: the model, the word vocabulary and the label vocabulary.
+Examples of classes that can be trained:
+
+- Financial [FIN]
+- Generic [GEN]
+- Company [COMP]
+- Number [NUMBER]
+- Document [DOC]
+- Location [LOC]
+- Person [PERS]
+- Phone [PHONE]
+- Address [ADDR]
+- Email [EMAIL]
+- Date [DATE]
+- Week Day [WD]
+- Money [MONEY]
+- Relatives [REL]
+- Vocatives [VOC]
+
+Additional information is used to identify where each recognized entity begins and ends:
+
+- The letter B indicates the beginning of an entity of class CLASS;
+- The letter I indicates that the respective token is a continuation of the CLASS entity that was started;
+- The letter O indicates that no entity was found for the token.
+
+For example, the sentence "ligar internet a cabo!" would be classified as "O O B-GEN I-GEN I-GEN",
+where B-GEN represents the beginning of the GEN entity (token "internet") and the next two tags are
+the continuation of the entity (tokens "a cabo"). The entity found in the sentence is
+"internet a cabo" of the GEN class.
+
+The following sections cover model training and prediction.
+
+## Training NER Model
+To train your own NER model using this package, the following steps should
+be performed:
+1) Import the main packages;
+2) Initialize the file variables: embedding, PosTagging, and train and validation .csv files;
+3) Initialize the NER parameters;
+4) Instantiate the vocabulary and label vocabulary objects;
+5) Save the vocabulary and label models;
+6) Read the PosTagging pickle file and the embedding, and initialize the PosTagging object;
+7) Initialize the BiLSTM-CRF model and set its embeddings;
+8) Initialize the LSTMCRFTrainer object;
+9) Train the model.
+
+An example of the above steps can be found in the Python code below:
+
+1) Import the main packages:
+```
+import torch
+
+from TakeBlipNer import utils, vocab, nermodel
+from TakeBlipNer.train import LSTMCRFTrainer
+from TakeBlipPosTagger.predict import PosTaggerPredict
+```
+2) Initialize the file variables:
+```
+wordembed_path = '*.kv'
+save_dir = '*'
+input_path = '*.csv'
+val_path = '*.csv'
+postag_model_path = '*.pkl'
+postag_label_path = '*.pkl'
+```
+3) Initialize the NER parameters
+
+In order to train a model, the following variables should be created:
+
+- **sentence_column**: String with the sentence column name in the train file;
+- **label_column**: String with the label column name in the train file;
+- **unknown_string**: String which represents the unknown token;
+- **padding_string**: String which represents the pad token;
+- **batch_size**: Number of samples that will be propagated through the network;
+- **shuffle**: Boolean indicating whether the dataset is shuffled;
+- **use_pre_processing**: Boolean indicating whether the sentence will be preprocessed;
+- **separator**: String with the file separator (for batch prediction);
+- **encoding**: String with the encoding used in the sentence;
+- **save_dir**: String with the directory where outputs are saved (checkpoints, vocabs, etc.);
+- **device**: String with the device where training will occur (cpu or gpu);
+- **word_dim**: Integer with the dimension of the word embeddings;
+- **lstm_dim**: Integer with the dimension of the LSTM cells. This determines the hidden state and cell state sizes;
+- **lstm_layers**: Integer with the number of LSTM layers;
+- **dropout_prob**: Float with the probability used in the dropout layers;
+- **bidirectional**: Boolean indicating whether the LSTM cells are bidirectional;
+- **alpha**: Float with the L2 penalization parameter;
+- **epochs**: Integer with the number of training epochs;
+- **ckpt_period**: Period to wait until a model checkpoint is saved to disk. Periods are specified by an integer and a unit ("e": epoch, "i": iteration, "s": global step);
+- **val**: Boolean indicating whether to perform validation;
+- **val_period**: Period to wait until a validation is performed. Periods are specified by an integer and a unit ("e": epoch, "i": iteration, "s": global step);
+- **samples**: Integer with the number of output samples to display at each validation;
+- **learning_rate**: Float with the learning rate;
+- **learning_rate_decay**: Float with the learning rate decay;
+- **max_patience**: Integer with the max patience parameter;
+- **max_decay_num**: Integer with the max number of learning rate decays;
+- **patience_threshold**: Float with the loss threshold used for the patience count.
+
+
+Example of parameters creation:
+```
+sentence_column = 'Message'
+label_column = 'Tags'
+unknown_string = '<unk>'
+padding_string = '<pad>'
+batch_size = 64
+shuffle = False
+use_pre_processing = True
+separator = '|'
+encoding = 'utf-8'
+device = 'cpu'
+word_dim = 300
+lstm_dim = 300
+lstm_layers = 1
+dropout_prob = 0.05
+bidirectional = False
+alpha = 0.5
+epochs = 10
+ckpt_period = '1e'
+val = True
+val_period = '1e'
+samples = 10
+learning_rate = 0.001
+learning_rate_decay = 0.01
+max_patience = 5
+max_decay_num = 5
+patience_threshold = 0.98
+```
+4) Instantiate the vocabulary and label vocabulary objects:
+```
+input_vocab = vocab.create_vocabulary(
+    input_path=input_path,
+    column_name=sentence_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    encoding=encoding,
+    separator=separator,
+    use_pre_processing=use_pre_processing)
+
+label_vocab = vocab.create_vocabulary(
+    input_path=input_path,
+    column_name=label_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    encoding=encoding,
+    separator=separator,
+    is_label=True)
+```
+
+5) Save the vocabulary and label models:
+```
+vocab.save_vocabs(save_dir, input_vocab, 'ner-vocab-input.pkl')
+vocab.save_vocabs(save_dir, label_vocab, 'ner-vocab-label.pkl')
+vocab.save_vocabs(save_dir, postag_label_path, 'vocab-postag.pkl')
+```
+
+6) Read the PosTagging pickle file and the embedding, and initialize the PosTagging object:
+```
+postag_bilstmcrf = torch.load(postag_model_path)
+embedding = utils.load_fasttext_embeddings(wordembed_path, padding_string)
+
+postag_model = PosTaggerPredict(
+    model=postag_bilstmcrf,
+    label_path=postag_label_path,
+    embedding=embedding)
+```
+
+7) Initialize the BiLSTM-CRF model and set its embeddings:
+```
+# The CRF layer scores tag-to-tag transitions between NER labels.
+crf = nermodel.CRF(
+    vocab_size=len(label_vocab),
+    pad_idx=input_vocab.f2i[padding_string],
+    unk_idx=input_vocab.f2i[unknown_string],
+    device=device).to(device)
+
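+# The LSTM-CRF model below combines word embeddings and PoS-tag embeddings as
+# inputs and produces the per-token scores that feed the CRF defined above.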
+bilstmcr_model = nermodel.LSTMCRF_NER(
+    device=device,
+    crf=crf,
+    vocab_sizes=[len(input_vocab), len(postag_model.label_vocab)],
+    word_dims=[word_dim, len(postag_model.label_vocab)],
+    hidden_dim=lstm_dim,
+    layers=lstm_layers,
+    dropout_prob=dropout_prob,
+    bidirectional=bidirectional,
+    alpha=alpha
+).to(device)
+
+bilstmcr_model.reset_parameters()
+
+fasttext = postag_model.fasttext
+
+# Load the pre-trained FastText vectors into the word embedding and freeze it.
+bilstmcr_model.embeddings[0].weight.data = torch.from_numpy(fasttext[input_vocab.i2f.values()])
+bilstmcr_model.embeddings[0].weight.requires_grad = False
+
+# Load the PoS-tag representation into the second embedding and freeze it too.
+utils.load_postag_representation(bilstmcr_model.embeddings[1], postag_model.label_vocab)
+bilstmcr_model.embeddings[1].weight.requires_grad = False
+```
+8) Initialize the LSTMCRFTrainer object:
+```
+trainer = LSTMCRFTrainer(
+    bilstmcrf_model=bilstmcr_model,
+    epochs=epochs,
+    input_vocab=input_vocab,
+    input_path=input_path,
+    postag_model=postag_model,
+    postag_label_vocab=postag_model.label_vocab,
+    label_vocab=label_vocab,
+    save_dir=save_dir,
+    ckpt_period=utils.PeriodChecker(ckpt_period),
+    val=val,
+    val_period=utils.PeriodChecker(val_period),
+    samples=samples,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    batch_size=batch_size,
+    shuffle=shuffle,
+    label_column=label_column,
+    encoding=encoding,
+    separator=separator,
+    use_pre_processing=use_pre_processing,
+    learning_rate=learning_rate,
+    learning_rate_decay=learning_rate_decay,
+    max_patience=max_patience,
+    max_decay_num=max_decay_num,
+    patience_threshold=patience_threshold,
+    val_path=val_path,
+    tensorboard=None)
+```
+9) Train the model:
+```
+trainer.train()
+```
+
+## Prediction
+
+The prediction can be done in two ways: with a single sentence or
+with a batch of sentences.
+
+### Single Prediction
+
+To predict a single sentence, the method **predict_line** should be used.
+
+**Important**: before labeling a sentence, a few steps are needed:
+1) Import the main packages;
+2) Initialize the model variables;
+3) Read the NER model and the embedding model;
+4) Initialize the predictor and use it.
+
+An example of the above steps can be found in the Python code below:
+
+1) Import the main packages:
+```
+import torch
+
+import TakeBlipNer.utils as utils
+from TakeBlipNer.predict import NerPredict
+```
+2) Initialize the model variables:
+
+In order to predict the sentence tags, the following variables should be
+created:
+- **postag_model_path**: string with the path of the PosTagging pickle model;
+- **postag_label_path**: string with the path of the PosTagging pickle labels;
+- **ner_model_path**: string with the path of the NER pickle model;
+- **ner_label_path**: string with the path of the NER pickle labels;
+- **wordembed_path**: string with the path of the FastText embedding file;
+- **save_dir**: string with the path and file name which will be used to
+  save the predicted sentences (for batch prediction);
+- **padding_string**: string which represents the pad token;
+- **encoding**: string with the encoding used in the sentence;
+- **separator**: string with the file separator (for batch prediction);
+- **sentence**: string with the sentence to be labeled.
+
+Example of variables creation:
+```
+postag_model_path = '*.pkl'
+postag_label_path = '*.pkl'
+ner_label_path = '*.pkl'
+ner_model_path = '*.pkl'
+wordembed_path = '*.kv'
+save_dir = '*.csv'
+padding_string = '<pad>'
+encoding = 'utf-8'
+separator = '|'
+sentence = 'SENTENCE EXAMPLE TO PREDICT'
+```
+3) Read the NER model and the embedding model:
+```
+bilstmcrf = torch.load(ner_model_path)
+embedding = utils.load_fasttext_embeddings(wordembed_path, padding_string)
+```
+4) Initialize the predictor and use it:
+```
+ner_predicter = NerPredict(
+    model=bilstmcrf,
+    label_path=ner_label_path,
+    embedding=embedding,
+    save_dir=save_dir,
+    encoding=encoding,
+    separator=separator)
+
+print(ner_predicter.predict_line(sentence))
+```
+
+### Batch Prediction
+
+To predict a batch of sentences in a .csv file, another set of variables should
+be created and passed to the **predict_batch** method. The variables are the following:
+- **input_path**: string with the path of the .csv file;
+- **sentence_column**: string with the sentence column name of the .csv file;
+- **unknown_string**: string which represents the unknown token;
+- **batch_size**: number of sentences which will be predicted at the same time;
+- **shuffle**: boolean indicating whether the dataset is shuffled;
+- **use_pre_processing**: boolean indicating whether the sentence will be preprocessed;
+- **output_lstm**: boolean indicating whether the LSTM prediction will be saved.
+
+Example of initialization and usage of the **predict_batch** method:
+```
+input_path = '*.csv'
+sentence_column = '*'
+unknown_string = '<unk>'
+batch_size = 64
+shuffle = False
+use_pre_processing = True
+output_lstm = True
+
+ner_predicter.predict_batch(
+    filepath=input_path,
+    sentence_column=sentence_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    batch_size=batch_size,
+    shuffle=shuffle,
+    use_pre_processing=use_pre_processing,
+    output_lstm=output_lstm)
+```
+The batch sentence predictions will be saved in the given **save_dir** path.
+
+
+
+%package help
+Summary: Development documents and examples for TakeBlipNer
+Provides: python3-TakeBlipNer-doc
+%description help
+# TakeBlipNer Package
+_Data & Analytics Research_
+
+## Overview
+
+NER (Named Entity Recognition) is an NLP task that aims to locate and classify named entities in a text.
+This implementation solves the NER task with a BiLSTM-CRF model, using the PyTorch framework to train a supervised model and to predict on CPU.
+For training, it receives a pre-trained FastText Gensim embedding, a PosTagging model and a .csv file. It outputs three pickle files: the model, the word vocabulary and the label vocabulary.
+Examples of classes that can be trained:
+
+- Financial [FIN]
+- Generic [GEN]
+- Company [COMP]
+- Number [NUMBER]
+- Document [DOC]
+- Location [LOC]
+- Person [PERS]
+- Phone [PHONE]
+- Address [ADDR]
+- Email [EMAIL]
+- Date [DATE]
+- Week Day [WD]
+- Money [MONEY]
+- Relatives [REL]
+- Vocatives [VOC]
+
+Additional information is used to identify where each recognized entity begins and ends:
+
+- The letter B indicates the beginning of an entity of class CLASS;
+- The letter I indicates that the respective token is a continuation of the CLASS entity that was started;
+- The letter O indicates that no entity was found for the token.
+
+For example, the sentence "ligar internet a cabo!" would be classified as "O O B-GEN I-GEN I-GEN",
+where B-GEN represents the beginning of the GEN entity (token "internet") and the next two tags are
+the continuation of the entity (tokens "a cabo").
+The entity found in the sentence is
+"internet a cabo" of the GEN class.
+
+The following sections cover model training and prediction.
+
+## Training NER Model
+To train your own NER model using this package, the following steps should
+be performed:
+1) Import the main packages;
+2) Initialize the file variables: embedding, PosTagging, and train and validation .csv files;
+3) Initialize the NER parameters;
+4) Instantiate the vocabulary and label vocabulary objects;
+5) Save the vocabulary and label models;
+6) Read the PosTagging pickle file and the embedding, and initialize the PosTagging object;
+7) Initialize the BiLSTM-CRF model and set its embeddings;
+8) Initialize the LSTMCRFTrainer object;
+9) Train the model.
+
+An example of the above steps can be found in the Python code below:
+
+1) Import the main packages:
+```
+import torch
+
+from TakeBlipNer import utils, vocab, nermodel
+from TakeBlipNer.train import LSTMCRFTrainer
+from TakeBlipPosTagger.predict import PosTaggerPredict
+```
+2) Initialize the file variables:
+```
+wordembed_path = '*.kv'
+save_dir = '*'
+input_path = '*.csv'
+val_path = '*.csv'
+postag_model_path = '*.pkl'
+postag_label_path = '*.pkl'
+```
+3) Initialize the NER parameters
+
+In order to train a model, the following variables should be created:
+
+- **sentence_column**: String with the sentence column name in the train file;
+- **label_column**: String with the label column name in the train file;
+- **unknown_string**: String which represents the unknown token;
+- **padding_string**: String which represents the pad token;
+- **batch_size**: Number of samples that will be propagated through the network;
+- **shuffle**: Boolean indicating whether the dataset is shuffled;
+- **use_pre_processing**: Boolean indicating whether the sentence will be preprocessed;
+- **separator**: String with the file separator (for batch prediction);
+- **encoding**: String with the encoding used in the sentence;
+- **save_dir**: String with the directory where outputs are saved (checkpoints, vocabs, etc.);
+- **device**: String with the device where training will occur (cpu or gpu);
+- **word_dim**: Integer with the dimension of the word embeddings;
+- **lstm_dim**: Integer with the dimension of the LSTM cells. This determines the hidden state and cell state sizes;
+- **lstm_layers**: Integer with the number of LSTM layers;
+- **dropout_prob**: Float with the probability used in the dropout layers;
+- **bidirectional**: Boolean indicating whether the LSTM cells are bidirectional;
+- **alpha**: Float with the L2 penalization parameter;
+- **epochs**: Integer with the number of training epochs;
+- **ckpt_period**: Period to wait until a model checkpoint is saved to disk. Periods are specified by an integer and a unit ("e": epoch, "i": iteration, "s": global step), as illustrated in the sketch after this list;
+- **val**: Boolean indicating whether to perform validation;
+- **val_period**: Period to wait until a validation is performed. Periods are specified by an integer and a unit ("e": epoch, "i": iteration, "s": global step);
+- **samples**: Integer with the number of output samples to display at each validation;
+- **learning_rate**: Float with the learning rate;
+- **learning_rate_decay**: Float with the learning rate decay;
+- **max_patience**: Integer with the max patience parameter;
+- **max_decay_num**: Integer with the max number of learning rate decays;
+- **patience_threshold**: Float with the loss threshold used for the patience count.
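+
+To make the period format concrete, here is a minimal sketch of how such a
+period string can be read. This is only an illustration of the convention; the
+package's own `utils.PeriodChecker` is the actual implementation and its
+internals may differ:
+```
+# Hypothetical parser for period strings such as '1e' or '100s'; it only
+# illustrates the "<integer><unit>" format expected by ckpt_period/val_period.
+def parse_period(period):
+    units = {'e': 'epoch', 'i': 'iteration', 's': 'global step'}
+    value, unit = int(period[:-1]), period[-1]
+    if unit not in units:
+        raise ValueError('unknown period unit: ' + unit)
+    return value, units[unit]
+
+print(parse_period('1e'))  # (1, 'epoch'): act once every epoch
+```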
+
+
+Example of parameters creation:
+```
+sentence_column = 'Message'
+label_column = 'Tags'
+unknown_string = '<unk>'
+padding_string = '<pad>'
+batch_size = 64
+shuffle = False
+use_pre_processing = True
+separator = '|'
+encoding = 'utf-8'
+device = 'cpu'
+word_dim = 300
+lstm_dim = 300
+lstm_layers = 1
+dropout_prob = 0.05
+bidirectional = False
+alpha = 0.5
+epochs = 10
+ckpt_period = '1e'
+val = True
+val_period = '1e'
+samples = 10
+learning_rate = 0.001
+learning_rate_decay = 0.01
+max_patience = 5
+max_decay_num = 5
+patience_threshold = 0.98
+```
+4) Instantiate the vocabulary and label vocabulary objects:
+```
+input_vocab = vocab.create_vocabulary(
+    input_path=input_path,
+    column_name=sentence_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    encoding=encoding,
+    separator=separator,
+    use_pre_processing=use_pre_processing)
+
+label_vocab = vocab.create_vocabulary(
+    input_path=input_path,
+    column_name=label_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    encoding=encoding,
+    separator=separator,
+    is_label=True)
+```
+
+5) Save the vocabulary and label models:
+```
+vocab.save_vocabs(save_dir, input_vocab, 'ner-vocab-input.pkl')
+vocab.save_vocabs(save_dir, label_vocab, 'ner-vocab-label.pkl')
+vocab.save_vocabs(save_dir, postag_label_path, 'vocab-postag.pkl')
+```
+
+6) Read the PosTagging pickle file and the embedding, and initialize the PosTagging object:
+```
+postag_bilstmcrf = torch.load(postag_model_path)
+embedding = utils.load_fasttext_embeddings(wordembed_path, padding_string)
+
+postag_model = PosTaggerPredict(
+    model=postag_bilstmcrf,
+    label_path=postag_label_path,
+    embedding=embedding)
+```
+
+7) Initialize the BiLSTM-CRF model and set its embeddings:
+```
+# The CRF layer scores tag-to-tag transitions between NER labels.
+crf = nermodel.CRF(
+    vocab_size=len(label_vocab),
+    pad_idx=input_vocab.f2i[padding_string],
+    unk_idx=input_vocab.f2i[unknown_string],
+    device=device).to(device)
+
+# The LSTM-CRF model combines word embeddings and PoS-tag embeddings as inputs.
+bilstmcr_model = nermodel.LSTMCRF_NER(
+    device=device,
+    crf=crf,
+    vocab_sizes=[len(input_vocab), len(postag_model.label_vocab)],
+    word_dims=[word_dim, len(postag_model.label_vocab)],
+    hidden_dim=lstm_dim,
+    layers=lstm_layers,
+    dropout_prob=dropout_prob,
+    bidirectional=bidirectional,
+    alpha=alpha
+).to(device)
+
+bilstmcr_model.reset_parameters()
+
+fasttext = postag_model.fasttext
+
+# Load the pre-trained FastText vectors into the word embedding and freeze it.
+bilstmcr_model.embeddings[0].weight.data = torch.from_numpy(fasttext[input_vocab.i2f.values()])
+bilstmcr_model.embeddings[0].weight.requires_grad = False
+
+# Load the PoS-tag representation into the second embedding and freeze it too.
+utils.load_postag_representation(bilstmcr_model.embeddings[1], postag_model.label_vocab)
+bilstmcr_model.embeddings[1].weight.requires_grad = False
+```
+8) Initialize the LSTMCRFTrainer object:
+```
+trainer = LSTMCRFTrainer(
+    bilstmcrf_model=bilstmcr_model,
+    epochs=epochs,
+    input_vocab=input_vocab,
+    input_path=input_path,
+    postag_model=postag_model,
+    postag_label_vocab=postag_model.label_vocab,
+    label_vocab=label_vocab,
+    save_dir=save_dir,
+    ckpt_period=utils.PeriodChecker(ckpt_period),
+    val=val,
+    val_period=utils.PeriodChecker(val_period),
+    samples=samples,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    batch_size=batch_size,
+    shuffle=shuffle,
+    label_column=label_column,
+    encoding=encoding,
+    separator=separator,
+    use_pre_processing=use_pre_processing,
+    learning_rate=learning_rate,
+    learning_rate_decay=learning_rate_decay,
+    max_patience=max_patience,
+    max_decay_num=max_decay_num,
+    patience_threshold=patience_threshold,
+    val_path=val_path,
+    tensorboard=None)
+```
+9) Train the model:
+```
+trainer.train()
+```
+
+## Prediction
+
+The prediction can be done in two ways: with a single sentence or
+with a batch of sentences.
+
+### Single Prediction
+
+To predict a single sentence, the method **predict_line** should be used.
+
+**Important**: before labeling a sentence, a few steps are needed:
+1) Import the main packages;
+2) Initialize the model variables;
+3) Read the NER model and the embedding model;
+4) Initialize the predictor and use it.
+
+An example of the above steps can be found in the Python code below:
+
+1) Import the main packages:
+```
+import torch
+
+import TakeBlipNer.utils as utils
+from TakeBlipNer.predict import NerPredict
+```
+2) Initialize the model variables:
+
+In order to predict the sentence tags, the following variables should be
+created:
+- **postag_model_path**: string with the path of the PosTagging pickle model;
+- **postag_label_path**: string with the path of the PosTagging pickle labels;
+- **ner_model_path**: string with the path of the NER pickle model;
+- **ner_label_path**: string with the path of the NER pickle labels;
+- **wordembed_path**: string with the path of the FastText embedding file;
+- **save_dir**: string with the path and file name which will be used to
+  save the predicted sentences (for batch prediction);
+- **padding_string**: string which represents the pad token;
+- **encoding**: string with the encoding used in the sentence;
+- **separator**: string with the file separator (for batch prediction);
+- **sentence**: string with the sentence to be labeled.
+
+Example of variables creation:
+```
+postag_model_path = '*.pkl'
+postag_label_path = '*.pkl'
+ner_label_path = '*.pkl'
+ner_model_path = '*.pkl'
+wordembed_path = '*.kv'
+save_dir = '*.csv'
+padding_string = '<pad>'
+encoding = 'utf-8'
+separator = '|'
+sentence = 'SENTENCE EXAMPLE TO PREDICT'
+```
+3) Read the NER model and the embedding model:
+```
+bilstmcrf = torch.load(ner_model_path)
+embedding = utils.load_fasttext_embeddings(wordembed_path, padding_string)
+```
+4) Initialize the predictor and use it:
+```
+ner_predicter = NerPredict(
+    model=bilstmcrf,
+    label_path=ner_label_path,
+    embedding=embedding,
+    save_dir=save_dir,
+    encoding=encoding,
+    separator=separator)
+
+print(ner_predicter.predict_line(sentence))
+```
+
+### Batch Prediction
+
+To predict a batch of sentences in a .csv file, another set of variables should
+be created and passed to the **predict_batch** method. The variables are the following:
+- **input_path**: string with the path of the .csv file;
+- **sentence_column**: string with the sentence column name of the .csv file;
+- **unknown_string**: string which represents the unknown token;
+- **batch_size**: number of sentences which will be predicted at the same time;
+- **shuffle**: boolean indicating whether the dataset is shuffled;
+- **use_pre_processing**: boolean indicating whether the sentence will be preprocessed;
+- **output_lstm**: boolean indicating whether the LSTM prediction will be saved.
+
+Example of initialization and usage of the **predict_batch** method:
+```
+input_path = '*.csv'
+sentence_column = '*'
+unknown_string = '<unk>'
+batch_size = 64
+shuffle = False
+use_pre_processing = True
+output_lstm = True
+
+ner_predicter.predict_batch(
+    filepath=input_path,
+    sentence_column=sentence_column,
+    pad_string=padding_string,
+    unk_string=unknown_string,
+    batch_size=batch_size,
+    shuffle=shuffle,
+    use_pre_processing=use_pre_processing,
+    output_lstm=output_lstm)
+```
+The batch sentence predictions will be saved in the given **save_dir** path.
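+
+As a follow-up, the saved predictions can be inspected with pandas. This is
+only a sketch under assumptions: it assumes the output is a delimiter-separated
+text file written to the `save_dir` path with the separator used above; the
+exact column layout depends on the package version, so check the file header:
+```
+import pandas as pd
+
+save_dir = '*.csv'  # same output path that was passed to NerPredict
+separator = '|'     # same separator that was passed to NerPredict
+
+# Load the written predictions and peek at the first rows.
+predictions = pd.read_csv(save_dir, sep=separator, encoding='utf-8')
+print(predictions.head())
+```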
+ + + +%prep +%autosetup -n TakeBlipNer-0.0.6 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . + +%files -n python3-TakeBlipNer -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Mon May 29 2023 Python_Bot - 0.0.6-1 +- Package Spec generated diff --git a/sources b/sources new file mode 100644 index 0000000..4a25719 --- /dev/null +++ b/sources @@ -0,0 +1 @@ +93b8cb74b0fea9c34f92543979a317ca TakeBlipNer-0.0.6.tar.gz -- cgit v1.2.3