import functools
import pprint
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
class AdvancedTextSummarizer:
    def __init__(self, model_name="sshleifer/distilbart-cnn-12-6", quantize=False):
        """Initialize the advanced summarizer with extra options.

        Args:
            model_name (str): Name of the pre-trained model to use
            quantize (bool): Whether to quantize the model for faster inference
        """
        # Dynamic quantization is CPU-only, so fall back to CPU when quantize=True
        self.device = "cuda" if torch.cuda.is_available() and not quantize else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        if quantize:
            self.model = torch.quantization.quantize_dynamic(self.model, dtype=torch.qint8)
        self.model.to(self.device)
    def preprocess_text(self, text: str) -> str:
        """Clean and normalize input text.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned and normalized text
        """
        # Collapse extra whitespace
        text = re.sub(r"\s+", " ", text.strip())
        # Remove URLs
        text = re.sub(r"https?://[^\s/$.?#].[^\s]*", "", text)
        # Remove special characters but keep punctuation
        text = re.sub(r"[^\w\s.,!?-]", "", text)
        return text
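    # Example behavior of preprocess_text (illustrative):
    #   "Visit   https://example.com  today!"  ->  "Visit  today!"
    # Whitespace is collapsed before URL removal, so dropping the URL
    # leaves a double space behind.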
    def split_long_text(self, text: str, max_tokens: int = 1024) -> list[str]:
        """Split long text into chunks that fit within the model's max token limit.

        Args:
            text (str): Input text
            max_tokens (int): Maximum tokens per chunk

        Returns:
            list[str]: List of text chunks
        """
        # Tokenize the full text
        tokens = self.tokenizer.tokenize(text)
        # Split into chunks, then convert back to strings
        chunks = [tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens)]
        return [self.tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
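    # Note: chunking on raw token counts can split sentences mid-thought, and
    # the tokenize/convert_tokens_to_string round trip may not restore the
    # original whitespace exactly. A sentence-aware splitter is a common
    # alternative when chunk boundaries matter.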
    @functools.lru_cache(maxsize=200)
    def cached_summarize(self, text: str, max_length: int = 130, min_length: int = 30,
                         length_penalty: float = 2.0, repetition_penalty: float = 2.0,
                         num_beams: int = 4, early_stopping: bool = True) -> str:
        """Cached version of the summarization function."""
        try:
            # Tokenize the input text
            inputs = self.tokenizer(text, max_length=1024, truncation=True,
                                    padding="max_length", return_tensors="pt"
                                    ).to(self.device)
            # Generate summary
            summary_ids = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                min_length=min_length,
                length_penalty=length_penalty,
                repetition_penalty=repetition_penalty,
                num_beams=num_beams,
                early_stopping=early_stopping,
                no_repeat_ngram_size=3,  # Prevent repetition of phrases
            )
            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            return summary
        except Exception as e:
            print(f"Error during summarization: {str(e)}")
            return text
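    # Note: functools.lru_cache on an instance method hashes `self` as part of
    # the cache key, so the cache holds a reference to the summarizer for the
    # life of the process. That is fine for a long-lived object like this one;
    # a per-instance cache (e.g. a dict keyed on the call arguments) avoids
    # the lingering reference if instances are short-lived.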
    def summarize_batch(self, texts: list[str], batch_size: int = 4, **kwargs) -> list[str]:
        """Summarize multiple texts efficiently in batches.

        Args:
            texts (list[str]): List of input texts
            batch_size (int): Number of texts to process at once
            **kwargs: Additional arguments for summarization

        Returns:
            list[str]: List of generated summaries
        """
        summaries = []
        for i in range(0, len(texts), batch_size):
            # Create batch and preprocess each text in the batch
            batch = texts[i:i + batch_size]
            processed_batch = [self.preprocess_text(text) for text in batch]
            # Tokenize batch
            inputs = self.tokenizer(processed_batch, max_length=1024, truncation=True,
                                    padding=True, return_tensors="pt"
                                    ).to(self.device)
            # Generate summaries for batch
            summary_ids = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **kwargs
            )
            # Decode summaries
            summaries.extend([self.tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids])
        return summaries
    def summarize(self, text: str, max_length: int = 130, min_length: int = 30,
                  length_penalty: float = 2.0, repetition_penalty: float = 2.0,
                  num_beams: int = 4, early_stopping: bool = True) -> dict[str, str]:
        """Generate a summary with advanced features.

        Args:
            text (str): The text to summarize
            max_length (int): Maximum length of the summary
            min_length (int): Minimum length of the summary
            length_penalty (float): Penalty for longer summaries
            repetition_penalty (float): Penalty for repeated tokens
            num_beams (int): Number of beams for beam search
            early_stopping (bool): Whether to stop when all beams are finished

        Returns:
            dict[str, str]: Dictionary containing the original and summarized text
        """
        # Preprocess the text
        cleaned_text = self.preprocess_text(text)
        # Handle long texts by splitting them into model-sized chunks
        chunks = self.split_long_text(cleaned_text)
        chunk_summaries = []
        for chunk in chunks:
            summary = self.cached_summarize(
                chunk,
                max_length=max_length // len(chunks),  # Adjust length per chunk
                min_length=min_length // len(chunks),
                length_penalty=length_penalty,
                repetition_penalty=repetition_penalty,
                num_beams=num_beams,
                early_stopping=early_stopping
            )
            chunk_summaries.append(summary)
        return {
            "original_text": text,
            "cleaned_text": cleaned_text,
            "summary": " ".join(chunk_summaries)
        }
# Initialize the advanced summarizer with caching enabled and quantization
adv_summarizer = AdvancedTextSummarizer(quantize=True)

# Sample text
long_text = """
The development of artificial intelligence (AI) has significantly impacted various industries worldwide.
From healthcare to finance, AI-powered applications have streamlined operations, improved accuracy,
and unlocked new possibilities. In healthcare, AI assists in diagnostics, personalized treatment plans,
and drug discovery. In finance, it aids in fraud detection, algorithmic trading, and customer service.
Despite its benefits, AI raises concerns about data privacy, ethical implications, and job displacement.
"""

# Generate a summary with default settings
adv_summary = adv_summarizer.summarize(long_text)
print("Advanced Summary:")
pprint.pprint(adv_summary)
# Batch summarization
texts = [
    "AI is revolutionizing healthcare with better diagnostics and personalized treatments.",
    "Self-driving cars are powered by machine learning algorithms that continuously learn from traffic patterns.",
    "Natural language processing helps computers understand and communicate with humans more effectively.",
    "Climate change is being studied using AI models to predict future environmental patterns.",
]

# Summarize multiple texts in a batch
batch_summaries = adv_summarizer.summarize_batch(texts, batch_size=2)
print("\nBatch Summaries:")
for i, s in enumerate(batch_summaries, 1):
    pprint.pprint(f"{i}. {s}")
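
# summarize_batch() forwards **kwargs straight to model.generate(), so beam
# search and length settings can be tuned per call. A minimal sketch; the
# parameter values below are illustrative, not tuned:
tuned_summaries = adv_summarizer.summarize_batch(
    texts,
    batch_size=2,
    max_length=60,
    min_length=15,
    num_beams=4,
    no_repeat_ngram_size=3,
)
for i, s in enumerate(tuned_summaries, 1):
    print(f"{i}. {s}")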