reflector/trials/title_summary/bert.py

import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the SentenceTransformer model
sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')

# Define the input text
text = "Your input text to be summarized goes here."

# Tokenize the text
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids]).to(device)

# Get the BERT model output
with torch.no_grad():
    outputs = model(input_ids)[0]  # Extract the last hidden states

# Calculate sentence embeddings
sentence_embeddings = outputs.mean(dim=1).squeeze().cpu().numpy()
input_text_embedding = sentence_transformer_model.encode([text])[0]

# Calculate cosine similarity between sentences and input text
similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings)

# Sort the sentences by similarity scores in descending order
sorted_sentences = [sent for _, sent in sorted(zip(similarity_scores[0], sentences), reverse=True)]

# Choose the top sentences as the summary
num_summary_sentences = 2  # Adjust as needed
summary = ". ".join(sorted_sentences[:num_summary_sentences])
print("Summary:", summary)