from functools import lru_cache
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
class AutoComplete:
    def __init__(self, model_name="gpt2"):
        """Initialize the auto-complete system."""
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side="left")
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()  # Set to evaluation mode

    def get_completion(self, text, max_length=50):
        """Generate a completion for the input text."""
        print("Generating completion for:", text)
        # Encode the input text
        inputs = self.tokenizer(text, add_special_tokens=False, return_tensors="pt")
        input_ids = inputs["input_ids"].to(self.device)
        attn_masks = inputs["attention_mask"].to(self.device)
        # Generate the completion
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids,
                attention_mask=attn_masks,
                max_length=max_length,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.7,
            )
        # Decode, then strip the prompt so only the new text remains
        full_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        completion = full_text[len(text):]
        return completion
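
# A minimal usage sketch for the base class. The prompt string and the
# function name are illustrative additions, not part of the original
# article; the function is defined but not called so the module stays
# cheap to import.
def demo_single_completion():
    ac = AutoComplete()
    print(ac.get_completion("The weather today is", max_length=30))
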
class CachedAutoComplete(AutoComplete):
    def __init__(self, cache_size=1000, **kwargs):
        """Initialize with caching support."""
        super().__init__(**kwargs)
        # Wrap the bound method so repeated prompts are served from the cache
        self.get_completion = lru_cache(maxsize=cache_size)(
            self.get_completion
        )
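
# Caching caveat, shown as a small sketch (demo_cache_behavior is a
# hypothetical helper, not from the original article): lru_cache keys on
# the argument values, so a repeated prompt returns the cached string
# verbatim even though do_sample=True, and cache_info() on the wrapped
# method reports the hit/miss counts.
def demo_cache_behavior():
    cac = CachedAutoComplete(cache_size=16)
    first = cac.get_completion("Once upon a time")
    second = cac.get_completion("Once upon a time")  # cache hit, not re-sampled
    assert first == second
    print(cac.get_completion.cache_info())
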
class OptimizedAutoComplete(CachedAutoComplete):
    def __init__(self, **kwargs):
        """Initialize with optimizations."""
        super().__init__(**kwargs)
        # GPT-2 has no pad token, so reuse EOS for batched padding
        self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.device == "cuda":
            self.model = self.model.half()  # Use FP16 on GPU
        self.model.eval()  # Keep eval mode after the dtype change

    def preprocess_batch(self, texts):
        """Efficiently tokenize multiple texts."""
        # Tokenize all texts at once, left-padding to a common length
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        return inputs.to(self.device)

    def generate_batch(self, texts, max_length=50):
        """Generate completions for multiple texts."""
        # Preprocess the batch
        inputs = self.preprocess_batch(texts)
        # Generate completions
        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.7,
            )
        # Decode the full sequences
        completions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # Keep only the newly generated text after each prompt
        results = []
        for text, completion in zip(texts, completions):
            results.append(completion[len(text):])
        return results
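
# Note that max_length budgets prompt plus completion, so long prompts can
# leave almost no room for new tokens. Below is a sketch of the alternative
# using max_new_tokens (a standard transformers generate() argument); the
# helper name is hypothetical, and ac should be an OptimizedAutoComplete.
def generate_batch_fixed_budget(ac, texts, max_new_tokens=30):
    inputs = ac.preprocess_batch(texts)
    with torch.no_grad():
        outputs = ac.model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,  # budget counts only new tokens
            pad_token_id=ac.tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
        )
    return ac.tokenizer.batch_decode(outputs, skip_special_tokens=True)
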
# Example: Optimized batch completion
optimized_complete = OptimizedAutoComplete()
texts = [
    "Machine learning is",
    "Deep neural networks can",
    "The training process involves",
]
completions = optimized_complete.generate_batch(texts)
for text, completion in zip(texts, completions):
    print(f"\nInput: {text}")
    print(f"Completion: {completion}")
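
# Optional timing sketch (hypothetical helper; the numbers depend entirely
# on hardware): compare one batched call against sequential single-prompt
# calls. Use fresh prompts, since the inherited lru_cache would otherwise
# skew the sequential timing.
def time_batch_vs_sequential(ac, texts):
    import time
    start = time.perf_counter()
    ac.generate_batch(texts)
    batch_s = time.perf_counter() - start
    start = time.perf_counter()
    for t in texts:
        ac.get_completion(t)
    seq_s = time.perf_counter() - start
    print(f"Batched: {batch_s:.2f}s, sequential: {seq_s:.2f}s")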