From 030be3a6abf6fcceef0979a2da24c81ad8a2c71a Mon Sep 17 00:00:00 2001
From: CoprDistGit
Date: Tue, 20 Jun 2023 06:26:10 +0000
Subject: automatic import of python-kobert-transformers

---
 .gitignore                      |   1 +
 python-kobert-transformers.spec | 516 ++++++++++++++++++++++++++++++++++++++++
 sources                         |   1 +
 3 files changed, 518 insertions(+)
 create mode 100644 python-kobert-transformers.spec
 create mode 100644 sources

diff --git a/.gitignore b/.gitignore
index e69de29..7803755 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/kobert-transformers-0.5.1.tar.gz
diff --git a/python-kobert-transformers.spec b/python-kobert-transformers.spec
new file mode 100644
index 0000000..d7e6358
--- /dev/null
+++ b/python-kobert-transformers.spec
@@ -0,0 +1,516 @@
%global _empty_manifest_terminate_build 0
Name: python-kobert-transformers
Version: 0.5.1
Release: 1
Summary: Transformers library for KoBERT, DistilKoBERT
License: Apache License 2.0
URL: https://github.com/monologg/KoBERT-Transformers
Source0: https://mirrors.aliyun.com/pypi/web/packages/36/04/3b41292198e1c7429c2104dcb05b8912ddb18582c8021b324d233313a807/kobert-transformers-0.5.1.tar.gz
BuildArch: noarch

Requires: python3-torch
Requires: python3-transformers
Requires: python3-sentencepiece

%description
# KoBERT-Transformers

`KoBERT` & `DistilKoBERT` on 🤗 Huggingface Transformers 🤗

The KoBERT model is identical to the one in the [official repo](https://github.com/SKTBrain/KoBERT). This repo exists to **support the full Huggingface tokenizer API**.

## 🚨 Important! 🚨

### 🙏 TL;DR

1. Be sure to install `transformers` `v3.0` or later!
2. For the `tokenizer`, use `kobert_transformers/tokenization_kobert.py` from this repo!

### 1. Tokenizer compatibility

`Huggingface Transformers` changed parts of its tokenization API starting with `v2.9.0`, and the existing `tokenization_kobert.py` has been updated to match the newer versions.

### 2. The padding_idx issue in Embedding

Previously, `BertEmbeddings` in `BertModel` **hard-coded** `padding_idx=0` (see the code below):

```python
class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
```

However, SentencePiece defaults to `pad_token_id=1` and `unk_token_id=0` (KoBERT uses the same convention), so a `BertModel` that consumes these ids as-is can produce unwanted results.

Huggingface recently recognized this issue as well and fixed it in `v2.9.0` ([related PR #3793](https://github.com/huggingface/transformers/pull/3793)): `pad_token_id=1` can now be set in the config, which resolves the problem.
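As a minimal sketch of what that config override looks like in practice (illustrative only, not part of this package's code):

```python
from transformers import BertConfig, BertModel

# Illustrative: make the pad id explicit when loading the config, then build the model on it.
config = BertConfig.from_pretrained("monologg/kobert", pad_token_id=1)
model = BertModel.from_pretrained("monologg/kobert", config=config)
print(model.embeddings.word_embeddings.padding_idx)  # expected: 1
```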
With that change, `BertEmbeddings` reads the padding index from the config:

```python
class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
```

However, `v2.9.0` still left this issue unfixed for `DistilBERT`, `ALBERT`, and others, so I submitted a PR to handle it myself ([related PR #3965](https://github.com/huggingface/transformers/pull/3965)), **and the fix was finally released in `v2.9.1`.**

The code below shows the difference between the previous and current versions.

```python
# Transformers v2.7.0
>>> from transformers import BertModel, DistilBertModel
>>> model = BertModel.from_pretrained("monologg/kobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=0)
>>> model = DistilBertModel.from_pretrained("monologg/distilkobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=0)


# Transformers v2.9.1
>>> from transformers import BertModel, DistilBertModel
>>> model = BertModel.from_pretrained("monologg/kobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=1)
>>> model = DistilBertModel.from_pretrained("monologg/distilkobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=1)
```

## KoBERT / DistilKoBERT on 🤗 Transformers 🤗

### Dependencies

- torch>=1.1.0
- transformers>=3,<5

### How to Use

```python
>>> from transformers import BertModel, DistilBertModel
>>> bert_model = BertModel.from_pretrained('monologg/kobert')
>>> distilbert_model = DistilBertModel.from_pretrained('monologg/distilkobert')
```

**To use the tokenizer, copy [`kobert_transformers/tokenization_kobert.py`](https://github.com/monologg/KoBERT-Transformers/blob/master/kobert_transformers/tokenization_kobert.py) into your project and import `KoBertTokenizer` from it.**

- KoBERT and DistilKoBERT use the same tokenizer.
- **The original KoBERT had an issue where special tokens were not split off correctly**; that has been fixed here. ([Issue link](https://github.com/SKTBrain/KoBERT/issues/11))

```python
>>> from tokenization_kobert import KoBertTokenizer
>>> tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')  # same for monologg/distilkobert
>>> tokenizer.tokenize("[CLS] ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค. [SEP]")
['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']
>>> tokenizer.convert_tokens_to_ids(['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'])
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]
```

## Kobert-Transformers (Pip library)

[![PyPI](https://img.shields.io/pypi/v/kobert-transformers)](https://pypi.org/project/kobert-transformers/)
[![license](https://img.shields.io/badge/license-Apache%202.0-red)](https://github.com/monologg/DistilKoBERT/blob/master/LICENSE)
[![Downloads](https://pepy.tech/badge/kobert-transformers)](https://pepy.tech/project/kobert-transformers)

- A Python library that wraps `tokenization_kobert.py`
- Provides KoBERT and DistilKoBERT in the Huggingface Transformers library format
- `v0.5.1` installs `transformers v3.0` or later by default (usable without issues up to `transformers v4.0`)

### Install Kobert-Transformers

```bash
pip3 install kobert-transformers
```
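If you want to pin the surrounding toolchain to the supported range up front, a hypothetical install line (the pins below are illustrative, not requirements of this package) could look like:

```bash
# Illustrative pins only: any torch >= 1.1.0 and transformers >= 3, < 5 should work.
pip3 install "kobert-transformers==0.5.1" "torch>=1.1.0" "transformers>=3,<5"
```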
### How to Use

```python
>>> import torch
>>> from kobert_transformers import get_kobert_model, get_distilkobert_model
>>> model = get_kobert_model()
>>> model.eval()
>>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
>>> attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
>>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
>>> sequence_output, pooled_output = model(input_ids, attention_mask, token_type_ids)
>>> sequence_output[0]
tensor([[-0.2461,  0.2428,  0.2590,  ..., -0.4861, -0.0731,  0.0756],
        [-0.2478,  0.2420,  0.2552,  ..., -0.4877, -0.0727,  0.0754],
        [-0.2472,  0.2420,  0.2561,  ..., -0.4874, -0.0733,  0.0765]],
       grad_fn=<SelectBackward>)
```

```python
>>> from kobert_transformers import get_tokenizer
>>> tokenizer = get_tokenizer()
>>> tokenizer.tokenize("[CLS] ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค. [SEP]")
['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']
>>> tokenizer.convert_tokens_to_ids(['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'])
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]
```

## Reference

- [KoBERT](https://github.com/SKTBrain/KoBERT)
- [DistilKoBERT](https://github.com/monologg/DistilKoBERT)
- [Huggingface Transformers](https://github.com/huggingface/transformers)




%package -n python3-kobert-transformers
Summary: Transformers library for KoBERT, DistilKoBERT
Provides: python-kobert-transformers
BuildRequires: python3-devel
BuildRequires: python3-setuptools
BuildRequires: python3-pip
%description -n python3-kobert-transformers
# KoBERT-Transformers

`KoBERT` & `DistilKoBERT` on 🤗 Huggingface Transformers 🤗

The KoBERT model is identical to the one in the [official repo](https://github.com/SKTBrain/KoBERT). This repo exists to **support the full Huggingface tokenizer API**.

## 🚨 Important! 🚨

### 🙏 TL;DR

1. Be sure to install `transformers` `v3.0` or later!
2. For the `tokenizer`, use `kobert_transformers/tokenization_kobert.py` from this repo!

### 1. Tokenizer compatibility

`Huggingface Transformers` changed parts of its tokenization API starting with `v2.9.0`, and the existing `tokenization_kobert.py` has been updated to match the newer versions.

### 2. The padding_idx issue in Embedding

Previously, `BertEmbeddings` in `BertModel` **hard-coded** `padding_idx=0` (see the code below):

```python
class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
```

However, SentencePiece defaults to `pad_token_id=1` and `unk_token_id=0` (KoBERT uses the same convention), so a `BertModel` that consumes these ids as-is can produce unwanted results.
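Those defaults are easy to confirm; a minimal sketch (assuming `tokenization_kobert.py` from this repo is importable):

```python
from tokenization_kobert import KoBertTokenizer

tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
print(tokenizer.pad_token_id)  # expected: 1
print(tokenizer.unk_token_id)  # expected: 0
```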
Huggingface recently recognized this issue as well and fixed it in `v2.9.0` ([related PR #3793](https://github.com/huggingface/transformers/pull/3793)): `pad_token_id=1` can now be set in the config, which resolves the problem. With that change, `BertEmbeddings` reads the padding index from the config:

```python
class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
```

However, `v2.9.0` still left this issue unfixed for `DistilBERT`, `ALBERT`, and others, so I submitted a PR to handle it myself ([related PR #3965](https://github.com/huggingface/transformers/pull/3965)), **and the fix was finally released in `v2.9.1`.**

The code below shows the difference between the previous and current versions.

```python
# Transformers v2.7.0
>>> from transformers import BertModel, DistilBertModel
>>> model = BertModel.from_pretrained("monologg/kobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=0)
>>> model = DistilBertModel.from_pretrained("monologg/distilkobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=0)


# Transformers v2.9.1
>>> from transformers import BertModel, DistilBertModel
>>> model = BertModel.from_pretrained("monologg/kobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=1)
>>> model = DistilBertModel.from_pretrained("monologg/distilkobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=1)
```

## KoBERT / DistilKoBERT on 🤗 Transformers 🤗

### Dependencies

- torch>=1.1.0
- transformers>=3,<5

### How to Use

```python
>>> from transformers import BertModel, DistilBertModel
>>> bert_model = BertModel.from_pretrained('monologg/kobert')
>>> distilbert_model = DistilBertModel.from_pretrained('monologg/distilkobert')
```

**To use the tokenizer, copy [`kobert_transformers/tokenization_kobert.py`](https://github.com/monologg/KoBERT-Transformers/blob/master/kobert_transformers/tokenization_kobert.py) into your project and import `KoBertTokenizer` from it.**

- KoBERT and DistilKoBERT use the same tokenizer.
- **The original KoBERT had an issue where special tokens were not split off correctly**; that has been fixed here. ([Issue link](https://github.com/SKTBrain/KoBERT/issues/11))

```python
>>> from tokenization_kobert import KoBertTokenizer
>>> tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')  # same for monologg/distilkobert
>>> tokenizer.tokenize("[CLS] ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค. [SEP]")
['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']
>>> tokenizer.convert_tokens_to_ids(['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'])
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]
```
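Because the tokenizer follows the standard Huggingface interface, the regular call API (available since `transformers v3.0`) works as well; a minimal sketch (the padding length below is illustrative):

```python
# Illustrative: pad to a fixed length and check that padding really uses id 1.
enc = tokenizer("ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค.", padding="max_length", max_length=16, return_tensors="pt")
print(enc["input_ids"].shape)                                    # torch.Size([1, 16])
assert enc["input_ids"][0, -1].item() == tokenizer.pad_token_id  # 1 for KoBERT
assert enc["attention_mask"][0, -1].item() == 0                  # padding is masked out
```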
[SEP]") +>>> ['[CLS]', 'โ–ํ•œ๊ตญ', '์–ด', 'โ–๋ชจ๋ธ', '์„', 'โ–๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'] +>>> tokenizer.convert_tokens_to_ids(['[CLS]', 'โ–ํ•œ๊ตญ', '์–ด', 'โ–๋ชจ๋ธ', '์„', 'โ–๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']) +>>> [2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3] +``` + +## Kobert-Transformers (Pip library) + +[![PyPI](https://img.shields.io/pypi/v/kobert-transformers)](https://pypi.org/project/kobert-transformers/) +[![license](https://img.shields.io/badge/license-Apache%202.0-red)](https://github.com/monologg/DistilKoBERT/blob/master/LICENSE) +[![Downloads](https://pepy.tech/badge/kobert-transformers)](https://pepy.tech/project/kobert-transformers) + +- `tokenization_kobert.py`๋ฅผ ๋žฉํ•‘ํ•œ ํŒŒ์ด์ฌ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ +- KoBERT, DistilKoBERT๋ฅผ Huggingface Transformers ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ํ˜•ํƒœ๋กœ ์ œ๊ณต +- `v0.5.1`์—์„œ๋Š” `transformers v3.0` ์ด์ƒ์œผ๋กœ ๊ธฐ๋ณธ ์„ค์น˜ํ•ฉ๋‹ˆ๋‹ค. (`transformers v4.0` ๊นŒ์ง€๋Š” ์ด์Šˆ ์—†์ด ์‚ฌ์šฉ ๊ฐ€๋Šฅ) + +### Install Kobert-Transformers + +```bash +pip3 install kobert-transformers +``` + +### How to Use + +```python +>>> import torch +>>> from kobert_transformers import get_kobert_model, get_distilkobert_model +>>> model = get_kobert_model() +>>> model.eval() +>>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) +>>> attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) +>>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) +>>> sequence_output, pooled_output = model(input_ids, attention_mask, token_type_ids) +>>> sequence_output[0] +tensor([[-0.2461, 0.2428, 0.2590, ..., -0.4861, -0.0731, 0.0756], + [-0.2478, 0.2420, 0.2552, ..., -0.4877, -0.0727, 0.0754], + [-0.2472, 0.2420, 0.2561, ..., -0.4874, -0.0733, 0.0765]], + grad_fn=) +``` + +```python +>>> from kobert_transformers import get_tokenizer +>>> tokenizer = get_tokenizer() +>>> tokenizer.tokenize("[CLS] ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค. [SEP]") +['[CLS]', 'โ–ํ•œ๊ตญ', '์–ด', 'โ–๋ชจ๋ธ', '์„', 'โ–๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'] +>>> tokenizer.convert_tokens_to_ids(['[CLS]', 'โ–ํ•œ๊ตญ', '์–ด', 'โ–๋ชจ๋ธ', '์„', 'โ–๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']) +[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3] +``` + +## Reference + +- [KoBERT](https://github.com/SKTBrain/KoBERT) +- [DistilKoBERT](https://github.com/monologg/DistilKoBERT) +- [Huggingface Transformers](https://github.com/huggingface/transformers) + + + + +%package help +Summary: Development documents and examples for kobert-transformers +Provides: python3-kobert-transformers-doc +%description help +# KoBERT-Transformers + +`KoBERT` & `DistilKoBERT` on ๐Ÿค— Huggingface Transformers ๐Ÿค— + +KoBERT ๋ชจ๋ธ์€ [๊ณต์‹ ๋ ˆํฌ](https://github.com/SKTBrain/KoBERT)์˜ ๊ฒƒ๊ณผ ๋™์ผํ•ฉ๋‹ˆ๋‹ค. ๋ณธ ๋ ˆํฌ๋Š” **Huggingface tokenizer์˜ ๋ชจ๋“  API๋ฅผ ์ง€์›**ํ•˜๊ธฐ ์œ„ํ•ด์„œ ์ œ์ž‘๋˜์—ˆ์Šต๋‹ˆ๋‹ค. + +## ๐Ÿšจ ์ค‘์š”! ๐Ÿšจ + +### ๐Ÿ™ TL;DR + +1. `transformers` ๋Š” `v3.0` ์ด์ƒ์„ ๋ฐ˜๋“œ์‹œ ์„ค์น˜! +2. `tokenizer`๋Š” ๋ณธ ๋ ˆํฌ์˜ `kobert_transformers/tokenization_kobert.py`๋ฅผ ์‚ฌ์šฉ! + +### 1. Tokenizer ํ˜ธํ™˜ + +`Huggingface Transformers`๊ฐ€ `v2.9.0`๋ถ€ํ„ฐ tokenization ๊ด€๋ จ API๊ฐ€ ์ผ๋ถ€ ๋ณ€๊ฒฝ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ์ด์— ๋งž์ถฐ ๊ธฐ์กด์˜ `tokenization_kobert.py`๋ฅผ ์ƒ์œ„ ๋ฒ„์ „์— ๋งž๊ฒŒ ์ˆ˜์ •ํ•˜์˜€์Šต๋‹ˆ๋‹ค. + +### 2. Embedding์˜ padding_idx ์ด์Šˆ + +์ด์ „๋ถ€ํ„ฐ `BertModel`์˜ `BertEmbeddings`์—์„œ `padding_idx=0`์œผ๋กœ **Hard-coding**๋˜์–ด ์žˆ์—ˆ์Šต๋‹ˆ๋‹ค. 

```python
class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
```

However, SentencePiece defaults to `pad_token_id=1` and `unk_token_id=0` (KoBERT uses the same convention), so a `BertModel` that consumes these ids as-is can produce unwanted results.

Huggingface recently recognized this issue as well and fixed it in `v2.9.0` ([related PR #3793](https://github.com/huggingface/transformers/pull/3793)): `pad_token_id=1` can now be set in the config, which resolves the problem. With that change, `BertEmbeddings` reads the padding index from the config:

```python
class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
```

However, `v2.9.0` still left this issue unfixed for `DistilBERT`, `ALBERT`, and others, so I submitted a PR to handle it myself ([related PR #3965](https://github.com/huggingface/transformers/pull/3965)), **and the fix was finally released in `v2.9.1`.**

The code below shows the difference between the previous and current versions.

```python
# Transformers v2.7.0
>>> from transformers import BertModel, DistilBertModel
>>> model = BertModel.from_pretrained("monologg/kobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=0)
>>> model = DistilBertModel.from_pretrained("monologg/distilkobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=0)


# Transformers v2.9.1
>>> from transformers import BertModel, DistilBertModel
>>> model = BertModel.from_pretrained("monologg/kobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=1)
>>> model = DistilBertModel.from_pretrained("monologg/distilkobert")
>>> model.embeddings.word_embeddings
Embedding(8002, 768, padding_idx=1)
```

## KoBERT / DistilKoBERT on 🤗 Transformers 🤗

### Dependencies

- torch>=1.1.0
- transformers>=3,<5

### How to Use

```python
>>> from transformers import BertModel, DistilBertModel
>>> bert_model = BertModel.from_pretrained('monologg/kobert')
>>> distilbert_model = DistilBertModel.from_pretrained('monologg/distilkobert')
```

**To use the tokenizer, copy [`kobert_transformers/tokenization_kobert.py`](https://github.com/monologg/KoBERT-Transformers/blob/master/kobert_transformers/tokenization_kobert.py) into your project and import `KoBertTokenizer` from it (a basic example follows below).**

- KoBERT and DistilKoBERT use the same tokenizer.
- **The original KoBERT had an issue where special tokens were not split off correctly**; that has been fixed here. ([Issue link](https://github.com/SKTBrain/KoBERT/issues/11))
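As a side note, since `KoBertTokenizer` implements the standard `PreTrainedTokenizer` interface, the usual save/reload API is also available; a minimal, hypothetical sketch (the local path is illustrative):

```python
from tokenization_kobert import KoBertTokenizer

tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
tokenizer.save_pretrained("./kobert-tokenizer")  # writes the vocab files locally
reloaded = KoBertTokenizer.from_pretrained("./kobert-tokenizer")
assert reloaded.tokenize("ํ•œ๊ตญ์–ด") == tokenizer.tokenize("ํ•œ๊ตญ์–ด")
```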

```python
>>> from tokenization_kobert import KoBertTokenizer
>>> tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')  # same for monologg/distilkobert
>>> tokenizer.tokenize("[CLS] ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค. [SEP]")
['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']
>>> tokenizer.convert_tokens_to_ids(['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'])
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]
```

## Kobert-Transformers (Pip library)

[![PyPI](https://img.shields.io/pypi/v/kobert-transformers)](https://pypi.org/project/kobert-transformers/)
[![license](https://img.shields.io/badge/license-Apache%202.0-red)](https://github.com/monologg/DistilKoBERT/blob/master/LICENSE)
[![Downloads](https://pepy.tech/badge/kobert-transformers)](https://pepy.tech/project/kobert-transformers)

- A Python library that wraps `tokenization_kobert.py`
- Provides KoBERT and DistilKoBERT in the Huggingface Transformers library format
- `v0.5.1` installs `transformers v3.0` or later by default (usable without issues up to `transformers v4.0`)

### Install Kobert-Transformers

```bash
pip3 install kobert-transformers
```

### How to Use

```python
>>> import torch
>>> from kobert_transformers import get_kobert_model, get_distilkobert_model
>>> model = get_kobert_model()
>>> model.eval()
>>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
>>> attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
>>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
>>> sequence_output, pooled_output = model(input_ids, attention_mask, token_type_ids)
>>> sequence_output[0]
tensor([[-0.2461,  0.2428,  0.2590,  ..., -0.4861, -0.0731,  0.0756],
        [-0.2478,  0.2420,  0.2552,  ..., -0.4877, -0.0727,  0.0754],
        [-0.2472,  0.2420,  0.2561,  ..., -0.4874, -0.0733,  0.0765]],
       grad_fn=<SelectBackward>)
```

```python
>>> from kobert_transformers import get_tokenizer
>>> tokenizer = get_tokenizer()
>>> tokenizer.tokenize("[CLS] ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค. [SEP]")
['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']
>>> tokenizer.convert_tokens_to_ids(['[CLS]', '▁ํ•œ๊ตญ', '์–ด', '▁๋ชจ๋ธ', '์„', '▁๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'])
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]
```
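The wrapped tokenizer also exposes the higher-level Huggingface API; for example, `encode` adds the special tokens automatically (the expected ids below are taken from the example above):

```python
>>> tokenizer.encode("ํ•œ๊ตญ์–ด ๋ชจ๋ธ์„ ๊ณต์œ ํ•ฉ๋‹ˆ๋‹ค.")  # [CLS] and [SEP] are added for you
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]
```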
[SEP]") +['[CLS]', 'โ–ํ•œ๊ตญ', '์–ด', 'โ–๋ชจ๋ธ', '์„', 'โ–๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]'] +>>> tokenizer.convert_tokens_to_ids(['[CLS]', 'โ–ํ•œ๊ตญ', '์–ด', 'โ–๋ชจ๋ธ', '์„', 'โ–๊ณต์œ ', 'ํ•ฉ๋‹ˆ๋‹ค', '.', '[SEP]']) +[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3] +``` + +## Reference + +- [KoBERT](https://github.com/SKTBrain/KoBERT) +- [DistilKoBERT](https://github.com/monologg/DistilKoBERT) +- [Huggingface Transformers](https://github.com/huggingface/transformers) + + + + +%prep +%autosetup -n kobert-transformers-0.5.1 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . + +%files -n python3-kobert-transformers -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Tue Jun 20 2023 Python_Bot - 0.5.1-1 +- Package Spec generated diff --git a/sources b/sources new file mode 100644 index 0000000..bcd9b95 --- /dev/null +++ b/sources @@ -0,0 +1 @@ +33625e69e5551325dbe9edbcb7151136 kobert-transformers-0.5.1.tar.gz -- cgit v1.2.3