import functools
import pprint
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
class AdvancedTextSummarizer:
    def __init__(self, model_name="sshleifer/distilbart-cnn-12-6", quantize=False):
        """Initialize the advanced summarizer with extra options.

        Args:
            model_name (str): Name of the pre-trained model to use
            quantize (bool): Whether to quantize the model for faster inference
        """
        # Dynamic quantization is CPU-only, so fall back to CPU when quantize=True
        self.device = "cuda" if torch.cuda.is_available() and not quantize else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        if quantize:
            self.model = torch.quantization.quantize_dynamic(self.model, dtype=torch.qint8)
        self.model.to(self.device)
    def preprocess_text(self, text: str) -> str:
        """Clean and normalize input text.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned and normalized text
        """
        # Collapse extra whitespace
        text = re.sub(r"\s+", " ", text.strip())
        # Remove URLs
        text = re.sub(r"https?://[^\s/$.?#].[^\s]*", "", text)
        # Remove special characters but keep punctuation
        text = re.sub(r"[^\w\s.,!?-]", "", text)
        return text
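    # Example behavior of preprocess_text (illustrative):
    #   "Visit   https://example.com  today!"  ->  "Visit  today!"
    # Whitespace is collapsed before URL removal, so dropping the URL
    # leaves a double space behind.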
    def split_long_text(self, text: str, max_tokens: int = 1024) -> list[str]:
        """Split long text into chunks that fit within the model's max token limit.

        Args:
            text (str): Input text
            max_tokens (int): Maximum tokens per chunk

        Returns:
            list[str]: List of text chunks
        """
        # Tokenize the full text
        tokens = self.tokenizer.tokenize(text)
        # Split into chunks, then convert back to strings
        chunks = [tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens)]
        return [self.tokenizer.convert_tokens_to_string(chunk) for chunk in chunks]
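    # Note: chunking on raw token counts can split sentences mid-thought, and
    # the tokenize/convert_tokens_to_string round trip may not restore the
    # original whitespace exactly. A sentence-aware splitter is a common
    # alternative when chunk boundaries matter.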
    @functools.lru_cache(maxsize=200)
    def cached_summarize(self, text: str, max_length: int = 130, min_length: int = 30,
                         length_penalty: float = 2.0, repetition_penalty: float = 2.0,
                         num_beams: int = 4, early_stopping: bool = True) -> str:
        """Cached version of the summarization function."""
        try:
            # Tokenize the input text
            inputs = self.tokenizer(text, max_length=1024, truncation=True,
                                    padding="max_length", return_tensors="pt"
                                    ).to(self.device)
            # Generate summary
            summary_ids = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                min_length=min_length,
                length_penalty=length_penalty,
                repetition_penalty=repetition_penalty,
                num_beams=num_beams,
                early_stopping=early_stopping,
                no_repeat_ngram_size=3,  # Prevent repetition of phrases
            )
            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            return summary
        except Exception as e:
            print(f"Error during summarization: {str(e)}")
            return text
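    # Note: functools.lru_cache on an instance method hashes `self` as part of
    # the cache key, so the cache holds a reference to the summarizer for the
    # life of the process. That is fine for a long-lived object like this one;
    # a per-instance cache (e.g. a dict keyed on the call arguments) avoids
    # the lingering reference if instances are short-lived.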
    def summarize_batch(self, texts: list[str], batch_size: int = 4, **kwargs) -> list[str]:
        """Summarize multiple texts efficiently in batches.

        Args:
            texts (list[str]): List of input texts
            batch_size (int): Number of texts to process at once
            **kwargs: Additional arguments for summarization

        Returns:
            list[str]: List of generated summaries
        """
        summaries = []
        for i in range(0, len(texts), batch_size):
            # Create batch and preprocess each text in the batch
            batch = texts[i:i + batch_size]
            processed_batch = [self.preprocess_text(text) for text in batch]
            # Tokenize batch
            inputs = self.tokenizer(processed_batch, max_length=1024, truncation=True,
                                    padding=True, return_tensors="pt"
                                    ).to(self.device)
            # Generate summaries for batch
            summary_ids = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **kwargs
            )
            # Decode summaries
            summaries.extend([self.tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids])
        return summaries
    def summarize(self, text: str, max_length: int = 130, min_length: int = 30,
                  length_penalty: float = 2.0, repetition_penalty: float = 2.0,
                  num_beams: int = 4, early_stopping: bool = True) -> dict[str, str]:
        """Generate a summary with advanced features.

        Args:
            text (str): The text to summarize
            max_length (int): Maximum length of the summary
            min_length (int): Minimum length of the summary
            length_penalty (float): Penalty for longer summaries
            repetition_penalty (float): Penalty for repeated tokens
            num_beams (int): Number of beams for beam search
            early_stopping (bool): Whether to stop when all beams are finished

        Returns:
            dict[str, str]: Dictionary containing the original and summarized text
        """
        # Preprocess the text
        cleaned_text = self.preprocess_text(text)
        # Handle long texts by splitting them into model-sized chunks
        chunks = self.split_long_text(cleaned_text)
        chunk_summaries = []
        for chunk in chunks:
            summary = self.cached_summarize(
                chunk,
                max_length=max_length // len(chunks),  # Adjust length per chunk
                min_length=min_length // len(chunks),
                length_penalty=length_penalty,
                repetition_penalty=repetition_penalty,
                num_beams=num_beams,
                early_stopping=early_stopping
            )
            chunk_summaries.append(summary)
        return {
            "original_text": text,
            "cleaned_text": cleaned_text,
            "summary": " ".join(chunk_summaries)
        }
# Initialize the advanced summarizer with caching enabled and quantization
adv_summarizer = AdvancedTextSummarizer(quantize=True)

# Sample text
long_text = """
The development of artificial intelligence (AI) has significantly impacted various industries worldwide.
From healthcare to finance, AI-powered applications have streamlined operations, improved accuracy,
and unlocked new possibilities. In healthcare, AI assists in diagnostics, personalized treatment plans,
and drug discovery. In finance, it aids in fraud detection, algorithmic trading, and customer service.
Despite its benefits, AI raises concerns about data privacy, ethical implications, and job displacement.
"""

# Generate a summary with default settings
adv_summary = adv_summarizer.summarize(long_text)
print("Advanced Summary:")
pprint.pprint(adv_summary)
# Batch summarization
texts = [
    "AI is revolutionizing healthcare with better diagnostics and personalized treatments.",
    "Self-driving cars are powered by machine learning algorithms that continuously learn from traffic patterns.",
    "Natural language processing helps computers understand and communicate with humans more effectively.",
    "Climate change is being studied using AI models to predict future environmental patterns.",
]

# Summarize multiple texts in a batch
batch_summaries = adv_summarizer.summarize_batch(texts, batch_size=2)
print("\nBatch Summaries:")
for i, s in enumerate(batch_summaries, 1):
    pprint.pprint(f"{i}. {s}")
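
# summarize_batch() forwards **kwargs straight to model.generate(), so beam
# search and length settings can be tuned per call. A minimal sketch; the
# parameter values below are illustrative, not tuned:
tuned_summaries = adv_summarizer.summarize_batch(
    texts,
    batch_size=2,
    max_length=60,
    min_length=15,
    num_beams=4,
    no_repeat_ngram_size=3,
)
for i, s in enumerate(tuned_summaries, 1):
    print(f"{i}. {s}")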