import numpy as np
import torch
from transformers import BertTokenizer, BertModel
def get_context_vectors(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Get the tokens (for reference)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Forward pass, requesting the hidden states from every layer
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)

    hidden_states = outputs.hidden_states

    # Each element in hidden_states has shape (batch_size, sequence_length, hidden_size)
    # Take the first element in the batch from the last layer
    last_layer_vectors = hidden_states[-1][0].numpy()  # Shape: (sequence_length, hidden_size)

    return tokens, last_layer_vectors
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
def extract_contextual_keywords(document, model, tokenizer, top_n=5):
    """Extract contextual keywords from a document."""
    # Split the document into sentences (simple split on periods)
    sentences = [s.strip() for s in document.split(".") if s.strip()]

    # Process each sentence to get context vectors
    all_tokens = []
    all_vectors = []
    for sentence in sentences:
        if not sentence:
            continue  # Skip empty sentences

        # Get context vectors
        tokens, vectors = get_context_vectors(sentence, model, tokenizer)

        # Store tokens and vectors (excluding the special tokens [CLS] and [SEP])
        all_tokens.extend(tokens[1:-1])
        all_vectors.extend(vectors[1:-1])

    # Convert to a numpy array, then compute the document vector as the average of all token vectors
    all_vectors = np.array(all_vectors)
    doc_vector = np.mean(all_vectors, axis=0)

    # Calculate the similarity between each token vector and the document vector
    similarities = []
    for token, vec in zip(all_tokens, all_vectors):
        # Skip special tokens, punctuation, and common stop words
        if token in ["[CLS]", "[SEP]", ".", ",", "!", "?", "the", "a", "an", "is", "are", "was", "were"]:
            continue

        # Compute the similarity, then pair it with the token
        sim = cosine_similarity(vec, doc_vector)
        similarities.append((sim, token))

    # Sort by similarity and keep the top N
    top_similarities = sorted(similarities, reverse=True)[:top_n]
    return top_similarities
# Example document
document = """
Artificial intelligence is transforming industries around the world.
Machine learning algorithms can analyze vast amounts of data to identify patterns and make predictions.
Natural language processing enables computers to understand and generate human language.
Computer vision systems can recognize objects and interpret visual information.
These technologies are driving innovation in healthcare, finance, transportation, and many other sectors.
"""
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()
# Extract contextual keywords and print the result
top_keywords = extract_contextual_keywords(document, model, tokenizer, top_n=10)
print("Top contextual keywords:")
for similarity, token in top_keywords:
    print(f"{token}: {similarity:.4f}")
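# Optional sanity check (a minimal sketch, not part of the original script):
# for bert-base-uncased, outputs.hidden_states holds 13 tensors (the embedding
# output plus one per transformer layer), each with hidden_size 768, so the
# vectors returned for a sentence have shape (sequence_length, 768).
tokens, vectors = get_context_vectors("Machine learning is fascinating.", model, tokenizer)
print(len(tokens), vectors.shape)  # number of WordPiece tokens and (sequence_length, 768)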