#!/usr/bin/env python
# Creates a tiny, randomly-initialized Pegasus checkpoint with a truncated
# sentencepiece vocabulary — a lightweight fixture for fast tests.
# (Original file was mangled by table extraction; header reconstructed.)
from pathlib import Path
import json      # NOTE(review): unused in this script — kept in case other tooling relies on it
import tempfile  # NOTE(review): unused in this script — kept in case other tooling relies on it

from transformers import (
    PegasusConfig,
    PegasusForConditionalGeneration,
    PegasusTokenizer,
    PegasusTokenizerFast,  # NOTE(review): imported but not used below
)

# Source checkpoint to derive the tiny model from, and the output directory name.
mname_from = "google/pegasus-cnn_dailymail"
mname_very_small = "pegasus-cnn_dailymail-tiny-random"

# Full-size tokenizer and config; both get shrunk below.
tokenizer = PegasusTokenizer.from_pretrained(mname_from)
config = PegasusConfig.from_pretrained(mname_from)

import sys

# The sentencepiece protobuf bindings are not pip-installed; they come from a
# local checkout of the sentencepiece repository, hence the sys.path hack.
sys.path.append("./sentencepiece/python/src/sentencepiece")
import sentencepiece_model_pb2 as model

# Dump the full tokenizer so we can read back its raw spiece.model file.
tmp_dir = "/tmp/pegasus-tiny"
tokenizer.save_pretrained(tmp_dir)
file = tmp_dir + "/spiece.model"
with open(file, "rb") as f:
    data = f.read()

# Parse the serialized sentencepiece model so the vocab can be truncated.
m = model.ModelProto()
m.ParseFromString(data)

keep_items = 5000  # target vocab size (before Pegasus's reserved special tokens)

print("Shrinking vocab")
print(f"original dict {len(m.pieces)}")
# Drop every piece beyond the first keep_items in one slice deletion
# (protobuf repeated fields support slice del; replaces a pop() loop).
del m.pieces[keep_items:]
print(f"new dict {len(m.pieces)}")

with open(tmp_dir + "/spiece-short.model", "wb") as f:
    f.write(m.SerializeToString())

# Re-load the tokenizer from the truncated vocab file.
tokenizer = PegasusTokenizer(vocab_file=tmp_dir + "/spiece-short.model")

config.update(
    dict(
        vocab_size=keep_items + 12,  # +12: extra ids Pegasus reserves beyond the spiece vocab — TODO confirm against model card
        d_model=64,
        decoder_attention_heads=2,
        decoder_ffn_dim=64,
        decoder_layers=2,
        # NOTE(review): 16 encoder heads vs 2 decoder heads looks asymmetric for a
        # tiny model (d_model=64 still divides evenly) — confirm this is intentional.
        encoder_attention_heads=16,
        encoder_ffn_dim=64,
        encoder_layers=2,
        num_hidden_layers=2,
    )
)
print("new config", config)

# Randomly initialized tiny model; resize embeddings to match the shrunk tokenizer.
very_small_model = PegasusForConditionalGeneration(config)
print(f"num of params {very_small_model.num_parameters()}")
very_small_model.resize_token_embeddings(len(tokenizer))

# Smoke test: one forward pass through the tiny model.
src_texts = ["A long paragraph for summarization.", "Another paragraph for summarization."]
tgt_texts = ["Summary of the text.", "Another summary."]

batch = tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts, return_tensors="pt")
outputs = very_small_model(**batch)

print("test output:", len(outputs.logits[0]))

# Save in fp16 to keep the fixture small; config/tokenizer saved alongside.
very_small_model.half()
very_small_model.save_pretrained(mname_very_small)
config.save_pretrained(mname_very_small)
tokenizer.save_pretrained(mname_very_small)

print(f"Generated {mname_very_small}")