Text Summarization for World History Encyclopedia (a website)
Practical approaches to text summarization, using articles from the World History Encyclopedia website.
!pip install rouge_score
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import matplotlib.pyplot as plt
import html
import re
import random
import rouge_score
Categories of common text summarization tasks:
- Extractive vs Abstractive summarization
- Query-focused vs query-independent summarization
- Single-document vs multi-document summarization
Most common case:
- Single document, query-independent, extractive summarizer.
Limitations:
- May need to be customized for your own use case
- ROUGE, the metric used here to evaluate summarization, may also need to be customized for your research problem.
- Summarization is sensitive to the size of the input text. A better approach is to run summarization separately on different parts of the text; see the sketch below.
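As a rough sketch of that chunking idea, the snippet below splits a long text into fixed-size blocks of sentences and summarizes each block separately. Here summarize is a placeholder for any sentence-level summarizer (for example, the tfidf_summary function defined later in this notebook), the chunk size of 50 sentences is an arbitrary choice, and it assumes NLTK's punkt tokenizer is available (it is downloaded further down in the notebook).
from nltk import tokenize
def chunked_summary(text, summarize, chunk_size=50, sentences_per_chunk=3):
    # Split the article into blocks of `chunk_size` sentences and summarize
    # each block on its own, concatenating the per-block summaries.
    sentences = tokenize.sent_tokenize(text)
    summary = []
    for start in range(0, len(sentences), chunk_size):
        chunk = ' '.join(sentences[start:start + chunk_size])
        summary.extend(summarize(chunk, sentences_per_chunk))
    return summary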
Incentive:
Although being a well-rounded person is not easy, gaining general knowledge outside of our professions, especially knowledge of world history, can give us another perspective on the world and more empathy toward other environments and people. By far the most effective way to become well-rounded is to read books from a variety of genres. However, people today are often distracted by personal hardships and relationships with others, and therefore lack the time to read every book from beginning to end. For anyone who wants to learn some history from the World History Encyclopedia website, this notebook offers a lifesaver: we further summarize these already abbreviated history articles. The summaries will not only save you the time of reading a whole article, but also help you quickly grasp the full article (if you do have time to read it) by reading the summary first.
In this notebook, we focus on extractive methods in text summarization.
import requests
from bs4 import BeautifulSoup
import os.path
from dateutil import parser
import pandas as pd
import numpy as np
import re
import os
BASE_DIR = '/content'
def download_article(url):
    # Check if the article has already been downloaded
    filename = url.split('/')[-2] + ".html"
    os.makedirs(f"{BASE_DIR}/world_history_encyclopedia", exist_ok=True)
    filename = f"{BASE_DIR}/world_history_encyclopedia/" + filename
    if not os.path.isfile(filename):
        r = requests.get(url)
        with open(filename, "w+") as f:
            f.write(r.text)
    return filename
def clean_article(soup_source):
    # Skip newsletter sign-up prompts and blank paragraphs
    r = re.compile("(Sign up|news letter|\n)")
    texts = ""
    for t in soup_source.select('p'):
        if 'World History Encyclopedia' in t.text:
            break
        else:
            if not r.match(t.text):
                texts += t.text + ' '
    return texts
def parse_article(article_file):
    with open(article_file, "r") as f:
        raw_html = f.read()
    r = {}
    soup = BeautifulSoup(raw_html, 'html.parser')
    r['headline'] = soup.h1.text
    r['first_paragraph'] = soup.p.text
    r['text'] = clean_article(soup)
    return r
import reprlib
r = reprlib.Repr()
r.maxstring = 800
url1 = "https://www.worldhistory.org/article/1596/mulan-the-legend-through-history/"
article_name1 = download_article(url1)
article1 = parse_article(article_name1)
print (r.repr(article1['text']))
Topic representation methods distinguish important sentences by identifying the topics of sentences through their important words.
In this simple technique, we sum over the TF-IDF vector of each sentence to decide whether to include that sentence in our summary.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import nltk
nltk.download('punkt')
sentences = tokenize.sent_tokenize(article1['text'])
tfidfVectorizer = TfidfVectorizer()
words_tfidf = tfidfVectorizer.fit_transform(sentences)
num_summary_sentence = 10
# Sort the sentences in descending order by the sum of TF-IDF values
sent_sum = words_tfidf.sum(axis=1)
important_sent = np.argsort(sent_sum, axis=0)[::-1]
# Print the most important sentences in the order they appear in the article
for i in range(0, len(sentences)):
    if i in important_sent[:num_summary_sentence]:
        print(sentences[i])
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
def tfidf_summary(text, num_summary_sentence):
    summary_sentence = []
    sentences = tokenize.sent_tokenize(text)
    tfidfVectorizer = TfidfVectorizer()
    words_tfidf = tfidfVectorizer.fit_transform(sentences)
    # Rank sentences by the sum of their TF-IDF weights
    sentence_sum = words_tfidf.sum(axis=1)
    important_sentences = np.argsort(sentence_sum, axis=0)[::-1]
    # Keep the top sentences in the order they appear in the article
    for i in range(0, len(sentences)):
        if i in important_sentences[:num_summary_sentence]:
            summary_sentence.append(sentences[i])
    return summary_sentence
print("Tf-IDF method:")
tfidf_summary(article1['text'], 10)
LSA essentially applies SVD (singular value decomposition) from linear algebra to reduce the original term-frequency sentence matrix into lower-rank matrices that capture the essence of the article.
!pip install sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer
LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)
parser = PlaintextParser.from_string(article1['text'], Tokenizer(LANGUAGE))
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, num_summary_sentence):
    print(str(sentence))
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer
def lsa_summary(text, num_summary_sentence):
    summary_sentence = []
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, num_summary_sentence):
        summary_sentence.append(str(sentence))
    return summary_sentence
print("LSA Method:")
lsa_summary(article1['text'], 10)
Basically, indicator representation techniques create intermediate features between sentences to take their relationships into account, instead of using only the words in each sentence.
The TextRank technique is inspired by Google's PageRank graph-based ranking algorithm. For natural language text, the authors of this technique build a graph associated with the text. The original paper can be found here.
from sumy.summarizers.text_rank import TextRankSummarizer
parser = PlaintextParser.from_string(article1['text'], Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, num_summary_sentence):
    print(str(sentence))
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.text_rank import TextRankSummarizer
def textrank_summary(text, num_summary_sentence):
    summary_sentence = []
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, num_summary_sentence):
        summary_sentence.append(str(sentence))
    return summary_sentence
textrank_summary(article1['text'], 10)
ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a popular metric for measuring accuracy in language summarization and translation tasks. Although ROUGE can only measure syntactic matches rather than semantic similarity between words, it is still a very useful tool for evaluating summarization and machine translation.
def print_rouge_score(rouge_score):
    for k, v in rouge_score.items():
        print(k, 'Precision:', "{:.2f}".format(v.precision), 'Recall:', "{:.2f}".format(v.recall), 'fmeasure:', "{:.2f}".format(v.fmeasure))
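A minimal usage sketch of this helper, with made-up reference and candidate strings rather than actual article text:
example_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
# The reference and candidate strings below are illustrative placeholders only
example_scores = example_scorer.score(
    "Mulan is a legendary heroine of ancient Chinese folklore.",
    "Mulan is a famous heroine from a Chinese legend.")
print_rouge_score(example_scores)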
Since we don't have human-generated summaries of these World History Encyclopedia articles, for the model evaluation below we will use the first paragraph of each article as its reference gist.
class TextSummarization():
    def __init__(self, url, alg, num_sentences, n_gram):
        assert isinstance(alg, list), "Candidate algorithms are not list type"
        self.article = parse_article(download_article(url))
        self.alg_list = alg
        self.ngram = n_gram
        self.num_sentences = num_sentences
        # Use the article's first paragraph as the reference summary
        self.standard = self.article['first_paragraph']

    def rouge_score(self, raw_summary, best=False):
        # ROUGE can be measured over different n-gram lengths; we use the one set in the constructor
        scorer = rouge_scorer.RougeScorer([f'rouge{self.ngram}'], use_stemmer=True)
        scores = scorer.score(self.standard, raw_summary)
        if best:
            _, _, fmeasure = scores[f'rouge{self.ngram}']
            return fmeasure
        else:
            return scores

    def raw_summary(self):
        # Join each algorithm's summary sentences into a single string for scoring
        summaries = [(' '.join(alg(self.article['text'], self.num_sentences)), alg) for alg in self.alg_list]
        return summaries

    def print_summary_detail(self):
        names = []
        scores = []
        summaries = []
        for summary, alg in self.raw_summary():
            names.append(alg.__name__)
            scores.append(self.rouge_score(summary).items())
            summaries.append(alg(self.article['text'], self.num_sentences))
        print("Detail:")
        return {n: [sc, su] for n, sc, su in zip(names, scores, summaries)}

    def best_summary(self):
        # Pick the algorithm with the highest ROUGE f-measure against the first paragraph
        fmeasures = [self.rouge_score(summary, best=True) for summary, alg in self.raw_summary()]
        best_alg = self.alg_list[fmeasures.index(max(fmeasures))]
        print(*best_alg(self.article['text'], self.num_sentences), sep='\n')
        print()
url = 'https://www.worldhistory.org/article/1596/mulan-the-legend-through-history/'
summaries = TextSummarization(url, [tfidf_summary, lsa_summary, textrank_summary], 10, 2)
summaries.best_summary()
summaries.print_summary_detail()
url = 'https://www.worldhistory.org/Genghis_Khan/'
summaries = TextSummarization(url, [tfidf_summary, lsa_summary, textrank_summary], 10, 2)
summaries.best_summary()
summaries.print_summary_detail()
url = 'https://www.worldhistory.org/article/1920/the-iberian-conquest-of-the-americas/'
summaries = TextSummarization(url, [tfidf_summary, lsa_summary, textrank_summary], 10, 2)
summaries.best_summary()
summaries.print_summary_detail()
url = 'https://www.worldhistory.org/article/1893/christmas-through-the-ages/'
summaries = TextSummarization(url, [tfidf_summary, lsa_summary, textrank_summary], 10, 2)
summaries.best_summary()
summaries.print_summary_detail()
According to our experiments, no single model is the best for every article. Therefore, it is worthwhile to test several different summarization algorithms and choose the one that works best for your application.