import json
# Read the gold-standard document from the JSON file
with open('AIDA-YAGO2_2_GATE.json', 'r') as f:
    data = json.load(f)
txt = data['text']
txt
'China says Taiwan spoils atmosphere for talks . \nBEIJING 1996-08-22 \nChina on Thursday accused Taipei of spoiling the atmosphere for a resumption of talks across the Taiwan Strait with a visit to Ukraine by Taiwanese Vice President Lien Chan this week that infuriated Beijing . \nSpeaking only hours after Chinese state media said the time was right to engage in political talks with Taiwan , Foreign Ministry spokesman Shen Guofang told Reuters : " The necessary atmosphere for the opening of the talks has been disrupted by the Taiwan authorities . " \nState media quoted China \'s top negotiator with Taipei , Tang Shubei , as telling a visiting group from Taiwan on Wednesday that it was time for the rivals to hold political talks . \n" Now is the time for the two sides to engage in political talks ... \nthat is to end the state of hostility , " Thursday \'s overseas edition of the People \'s Daily quoted Tang as saying . \nThe foreign ministry \'s Shen told Reuters Television in an interview he had read reports of Tang \'s comments but gave no details of why the negotiator had considered the time right for talks with Taiwan , which Beijing considers a renegade province . \nChina , which has long opposed all Taipei efforts to gain greater international recognition , was infuriated by a visit to Ukraine this week by Taiwanese Vice President Lien . '
import spacy
from spacy.cli import download as spacy_download
spacy_model = 'en_core_web_sm'
spacy_download(spacy_model)
ner = spacy.load(spacy_model)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
# Perform NER
doc_spacy = ner(txt)
from spacy import displacy
displacy.render(doc_spacy, style='ent', jupyter=True)
from gatenlp import Document
# Open and display the gold standard
with open('AIDA-YAGO2_2_GATE.json', 'r') as inp:
gold_dict = json.load(inp)
gold_annotations_doc = Document.from_dict(gold_dict)
gold_annotations_doc
doc2 = Document(txt)
annset_ner = doc2.annset('spacy')
# add annotations from spacy to gatenlp
for ann in doc_spacy.ents:
annset_ner.add(ann.start_char, ann.end_char, ann.label_) #add the annotations to GateNLP Document
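Because the gatenlp annotations are built from spaCy's character offsets, it is worth verifying that those offsets really address the same strings in the raw text; a quick sanity check, not part of the original pipeline:
# every spaCy entity span should read back identically from the raw text
for ent in doc_spacy.ents:
    assert txt[ent.start_char:ent.end_char] == ent.text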
# Display the performed NER
eval_annotations_doc = Document.from_dict(doc2.to_dict())
eval_annotations_doc
gold_annset = gold_annotations_doc.annset('gold')
to_eval_annset = eval_annotations_doc.annset('spacy')
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.start == gold_ann_matched[0].start and ann.end == gold_ann_matched[0].end:
true_positive += 1
else:
false_positive += 1
exact_precision = true_positive / (true_positive + false_positive)
exact_precision
0.7317073170731707
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.start == to_eval_ann_matched[0].start and gold_ann.end == to_eval_ann_matched[0].end:
true_positive += 1
else:
false_negative += 1
exact_recall = true_positive / (true_positive + false_negative)
exact_recall
0.9375
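Precision and recall are usually combined into their harmonic mean, F1. Since exact matching is symmetric, the two loops above count the same true positives, so the value is well defined; a small convenience computation from the two values above:
# F1 is the harmonic mean of precision and recall
exact_f1 = 2 * exact_precision * exact_recall / (exact_precision + exact_recall)
exact_f1  # ≈ 0.822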
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.end == gold_ann_matched[0].end:
true_positive += 1
else:
false_positive += 1
right_precision = true_positive / (true_positive + false_positive)
right_precision
0.7804878048780488
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.end == to_eval_ann_matched[0].end:
true_positive += 1
else:
false_negative += 1
right_recall = true_positive / (true_positive + false_negative)
right_recall
0.9375
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) != 1:
false_positive += 1
continue
    # Build two lists containing the character offsets covered by the entity, both for the gold standard and for our NER
ListG = [*range(gold_ann_matched[0].start, gold_ann_matched[0].end)]
ListE = [*range(ann.start, ann.end)]
    # If the two lists share at least one offset, then NER found at least a fragment of the gold annotation
if any(item in ListE for item in ListG):
true_positive += 1
else:
false_positive += 1
partial_precision = true_positive / (true_positive + false_positive)
partial_precision
0.7804878048780488
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
ListG = [*range(gold_ann.start, gold_ann.end)]
if len(to_eval_ann_matched) != 1:
false_negative += 1
continue
ListE = [*range(to_eval_ann_matched[0].start, to_eval_ann_matched[0].end)]
if any(item in ListE for item in ListG):
true_positive += 1
else:
false_negative += 1
partial_recall = true_positive / (true_positive + false_negative)
partial_recall
0.9375
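Materializing the offset ranges as lists works, but two half-open spans [s1, e1) and [s2, e2) share at least one offset exactly when each starts before the other ends, so the same test can be done in constant time; a minimal sketch of the equivalent check:
def spans_overlap(a, b):
    # half-open intervals [start, end) overlap iff each starts before the other ends
    return a.start < b.end and b.start < a.end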
print('------------------ GOLD ANNOTATION SET -----------------')
for ann in gold_annset:
print({'start':ann.start, 'end':ann.end, 'type':ann.type, 'id':ann.id, 'mention':txt[ann.start:ann.end]})
print('------------------ SPACY ANNOTATION SET -----------------')
for ann in to_eval_annset:
print({'start':ann.start, 'end':ann.end, 'type':ann.type, 'id':ann.id, 'mention':txt[ann.start:ann.end]})
------------------ GOLD ANNOTATION SET -----------------
{'start': 0, 'end': 5, 'type': 'LOC', 'id': 0, 'mention': 'China'}
{'start': 11, 'end': 17, 'type': 'LOC', 'id': 1, 'mention': 'Taiwan'}
{'start': 49, 'end': 56, 'type': 'LOC', 'id': 2, 'mention': 'BEIJING'}
{'start': 69, 'end': 74, 'type': 'LOC', 'id': 3, 'mention': 'China'}
{'start': 95, 'end': 101, 'type': 'LOC', 'id': 4, 'mention': 'Taipei'}
{'start': 166, 'end': 179, 'type': 'LOC', 'id': 5, 'mention': 'Taiwan Strait'}
{'start': 196, 'end': 203, 'type': 'LOC', 'id': 6, 'mention': 'Ukraine'}
{'start': 207, 'end': 216, 'type': 'MISC', 'id': 7, 'mention': 'Taiwanese'}
{'start': 232, 'end': 241, 'type': 'PER', 'id': 8, 'mention': 'Lien Chan'}
{'start': 268, 'end': 275, 'type': 'LOC', 'id': 9, 'mention': 'Beijing'}
{'start': 305, 'end': 312, 'type': 'MISC', 'id': 10, 'mention': 'Chinese'}
{'start': 383, 'end': 389, 'type': 'LOC', 'id': 11, 'mention': 'Taiwan'}
{'start': 392, 'end': 408, 'type': 'ORG', 'id': 12, 'mention': 'Foreign Ministry'}
{'start': 419, 'end': 431, 'type': 'PER', 'id': 13, 'mention': 'Shen Guofang'}
{'start': 437, 'end': 444, 'type': 'ORG', 'id': 14, 'mention': 'Reuters'}
{'start': 529, 'end': 535, 'type': 'LOC', 'id': 15, 'mention': 'Taiwan'}
{'start': 572, 'end': 577, 'type': 'LOC', 'id': 16, 'mention': 'China'}
{'start': 601, 'end': 607, 'type': 'LOC', 'id': 17, 'mention': 'Taipei'}
{'start': 610, 'end': 621, 'type': 'PER', 'id': 18, 'mention': 'Tang Shubei'}
{'start': 657, 'end': 663, 'type': 'LOC', 'id': 19, 'mention': 'Taiwan'}
{'start': 884, 'end': 899, 'type': 'ORG', 'id': 20, 'mention': "People 's Daily"}
{'start': 907, 'end': 911, 'type': 'PER', 'id': 21, 'mention': 'Tang'}
{'start': 949, 'end': 953, 'type': 'ORG', 'id': 22, 'mention': 'Shen'}
{'start': 959, 'end': 977, 'type': 'ORG', 'id': 23, 'mention': 'Reuters Television'}
{'start': 1017, 'end': 1021, 'type': 'PER', 'id': 24, 'mention': 'Tang'}
{'start': 1121, 'end': 1127, 'type': 'LOC', 'id': 25, 'mention': 'Taiwan'}
{'start': 1136, 'end': 1143, 'type': 'LOC', 'id': 26, 'mention': 'Beijing'}
{'start': 1177, 'end': 1182, 'type': 'LOC', 'id': 27, 'mention': 'China'}
{'start': 1212, 'end': 1218, 'type': 'LOC', 'id': 28, 'mention': 'Taipei'}
{'start': 1300, 'end': 1307, 'type': 'LOC', 'id': 29, 'mention': 'Ukraine'}
{'start': 1321, 'end': 1330, 'type': 'MISC', 'id': 30, 'mention': 'Taiwanese'}
{'start': 1346, 'end': 1350, 'type': 'PER', 'id': 31, 'mention': 'Lien'}
------------------ SPACY ANNOTATION SET -----------------
{'start': 0, 'end': 5, 'type': 'GPE', 'id': 0, 'mention': 'China'}
{'start': 11, 'end': 17, 'type': 'GPE', 'id': 1, 'mention': 'Taiwan'}
{'start': 49, 'end': 56, 'type': 'GPE', 'id': 2, 'mention': 'BEIJING'}
{'start': 57, 'end': 67, 'type': 'DATE', 'id': 3, 'mention': '1996-08-22'}
{'start': 69, 'end': 74, 'type': 'GPE', 'id': 4, 'mention': 'China'}
{'start': 78, 'end': 86, 'type': 'DATE', 'id': 5, 'mention': 'Thursday'}
{'start': 95, 'end': 101, 'type': 'GPE', 'id': 6, 'mention': 'Taipei'}
{'start': 162, 'end': 179, 'type': 'LOC', 'id': 7, 'mention': 'the Taiwan Strait'}
{'start': 196, 'end': 203, 'type': 'GPE', 'id': 8, 'mention': 'Ukraine'}
{'start': 207, 'end': 216, 'type': 'NORP', 'id': 9, 'mention': 'Taiwanese'}
{'start': 232, 'end': 241, 'type': 'PERSON', 'id': 10, 'mention': 'Lien Chan'}
{'start': 242, 'end': 251, 'type': 'DATE', 'id': 11, 'mention': 'this week'}
{'start': 268, 'end': 275, 'type': 'GPE', 'id': 12, 'mention': 'Beijing'}
{'start': 288, 'end': 298, 'type': 'TIME', 'id': 13, 'mention': 'only hours'}
{'start': 305, 'end': 312, 'type': 'NORP', 'id': 14, 'mention': 'Chinese'}
{'start': 383, 'end': 389, 'type': 'GPE', 'id': 15, 'mention': 'Taiwan'}
{'start': 392, 'end': 408, 'type': 'ORG', 'id': 16, 'mention': 'Foreign Ministry'}
{'start': 419, 'end': 431, 'type': 'PERSON', 'id': 17, 'mention': 'Shen Guofang'}
{'start': 437, 'end': 444, 'type': 'ORG', 'id': 18, 'mention': 'Reuters'}
{'start': 529, 'end': 535, 'type': 'GPE', 'id': 19, 'mention': 'Taiwan'}
{'start': 553, 'end': 558, 'type': 'ORG', 'id': 20, 'mention': 'State'}
{'start': 572, 'end': 577, 'type': 'GPE', 'id': 21, 'mention': 'China'}
{'start': 601, 'end': 607, 'type': 'GPE', 'id': 22, 'mention': 'Taipei'}
{'start': 610, 'end': 621, 'type': 'GPE', 'id': 23, 'mention': 'Tang Shubei'}
{'start': 657, 'end': 663, 'type': 'GPE', 'id': 24, 'mention': 'Taiwan'}
{'start': 667, 'end': 676, 'type': 'DATE', 'id': 25, 'mention': 'Wednesday'}
{'start': 762, 'end': 765, 'type': 'CARDINAL', 'id': 26, 'mention': 'two'}
{'start': 848, 'end': 856, 'type': 'DATE', 'id': 27, 'mention': 'Thursday'}
{'start': 880, 'end': 899, 'type': 'ORG', 'id': 28, 'mention': "the People 's Daily"}
{'start': 907, 'end': 911, 'type': 'PERSON', 'id': 29, 'mention': 'Tang'}
{'start': 949, 'end': 953, 'type': 'ORG', 'id': 30, 'mention': 'Shen'}
{'start': 959, 'end': 977, 'type': 'ORG', 'id': 31, 'mention': 'Reuters Television'}
{'start': 1017, 'end': 1021, 'type': 'PERSON', 'id': 32, 'mention': 'Tang'}
{'start': 1121, 'end': 1127, 'type': 'GPE', 'id': 33, 'mention': 'Taiwan'}
{'start': 1136, 'end': 1143, 'type': 'GPE', 'id': 34, 'mention': 'Beijing'}
{'start': 1177, 'end': 1182, 'type': 'GPE', 'id': 35, 'mention': 'China'}
{'start': 1212, 'end': 1218, 'type': 'GPE', 'id': 36, 'mention': 'Taipei'}
{'start': 1300, 'end': 1307, 'type': 'GPE', 'id': 37, 'mention': 'Ukraine'}
{'start': 1308, 'end': 1317, 'type': 'DATE', 'id': 38, 'mention': 'this week'}
{'start': 1321, 'end': 1330, 'type': 'NORP', 'id': 39, 'mention': 'Taiwanese'}
{'start': 1346, 'end': 1350, 'type': 'PERSON', 'id': 40, 'mention': 'Lien'}
# Print all the entities whose type differs between NER and the gold standard
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.type != gold_ann_matched[0].type:
print({'mention': txt[ann.start:ann.end], 'spacy_type': ann.type, 'AIDA_type': gold_ann_matched[0].type})
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'BEIJING', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taipei', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Ukraine', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taiwanese', 'spacy_type': 'NORP', 'AIDA_type': 'MISC'}
{'mention': 'Lien Chan', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Beijing', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Chinese', 'spacy_type': 'NORP', 'AIDA_type': 'MISC'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Shen Guofang', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taipei', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Tang Shubei', 'spacy_type': 'GPE', 'AIDA_type': 'PER'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Tang', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Tang', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Beijing', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taipei', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Ukraine', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taiwanese', 'spacy_type': 'NORP', 'AIDA_type': 'MISC'}
{'mention': 'Lien', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
# Build a dictionary that stores the possible conversions
conversionMap = {}
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.type != gold_ann_matched[0].type:
        # If the type is already in the dictionary...
        if ann.type in conversionMap:
            # ... and the value is the same, ignore it (the information is already stored)
            if conversionMap[ann.type] == gold_ann_matched[0].type:
                continue
            # ... and the value is different, print a warning: we will have to check which of the two types is wrong
            else:
                print(ann.type + ' type has more than one corresponding AIDA type: ' + gold_ann_matched[0].type + ' and ' + conversionMap[ann.type])
        # If the type is not yet in the dictionary, add it
else:
conversionMap[ann.type] = gold_ann_matched[0].type
conversionMap
GPE type has more than one corresponding AIDA type: PER and LOC
{'GPE': 'LOC', 'NORP': 'MISC', 'PERSON': 'PER'}
Of course, this kind of operation only works in our setting, because the mappings it finds must be checked by hand: the for loop inserts into the dictionary every type difference between NER and the gold standard. If NER simply misclassifies an entity, we do not want that pair in the conversion map; we only want a mapping when the mismatch is systematic, since that means NER and AIDA are naming the same category in two different ways.
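One way to make that consistency requirement explicit is to count how often each (spaCy type, gold type) pair occurs and keep only the majority mapping for each spaCy type; a sketch along these lines (an illustration, not the procedure used above):
from collections import Counter

pair_counts = Counter()
for ann in to_eval_annset:
    matched = list(gold_annset.within((ann.start, ann.end)))
    if len(matched) == 1 and ann.type != matched[0].type:
        pair_counts[(ann.type, matched[0].type)] += 1
# most_common() yields pairs by decreasing frequency, so setdefault keeps
# the most frequent gold type for each spaCy type
majority_map = {}
for (spacy_type, gold_type), count in pair_counts.most_common():
    majority_map.setdefault(spacy_type, gold_type)
majority_map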
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
    if len(gold_ann_matched) == 1 and ann.type == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.type == to_eval_ann_matched[0].type:
true_positive += 1
else:
false_negative += 1
print('True Positive = ' + str(true_positive))
print('False Positive = ' + str(false_positive))
print('False Negative = ' + str(false_negative))
True Positive = 4
False Positive = 35
False Negative = 28
# Add every other type to the conversion map (key and value equal).
# This way, the loops below will always find a corresponding key in the conversion dictionary.
for ann in to_eval_annset:
if ann.type not in conversionMap:
conversionMap[ann.type] = ann.type
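As an alternative to pre-filling the map, a lookup with a default lets unmapped types fall through unchanged; a minimal sketch:
# dict.get returns the spaCy type itself when no conversion entry exists
def map_type(spacy_type):
    return conversionMap.get(spacy_type, spacy_type)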
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
    if len(gold_ann_matched) == 1 and conversionMap[ann.type] == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.type == conversionMap[to_eval_ann_matched[0].type]:
true_positive += 1
else:
false_negative += 1
print('True Positive = ' + str(true_positive))
print('False Positive = ' + str(false_positive))
print('False Negative = ' + str(false_negative))
True Positive = 29
False Positive = 10
False Negative = 3
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 \
and ann.start == gold_ann_matched[0].start \
and ann.end == gold_ann_matched[0].end \
        and ann.type == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
exact_typed_precision = true_positive / (true_positive + false_positive)
exact_typed_precision
0.0975609756097561
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 \
and gold_ann.start == to_eval_ann_matched[0].start \
and gold_ann.end == to_eval_ann_matched[0].end \
and gold_ann.type == to_eval_ann_matched[0].type:
true_positive += 1
else:
false_negative += 1
exact_typed_recall = true_positive / (true_positive + false_negative)
exact_typed_recall
0.125
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 \
and ann.start == gold_ann_matched[0].start \
and ann.end == gold_ann_matched[0].end \
        and conversionMap[ann.type] == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
exact_typed_precision = true_positive / (true_positive + false_positive)
exact_typed_precision
0.7073170731707317
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 \
and gold_ann.start == to_eval_ann_matched[0].start \
and gold_ann.end == to_eval_ann_matched[0].end \
and gold_ann.type == conversionMap[to_eval_ann_matched[0].type]:
true_positive += 1
else:
false_negative += 1
exact_typed_recall = true_positive / (true_positive + false_negative)
exact_typed_recall
0.90625
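To see the whole picture at a glance, the precision/recall pairs computed so far can be printed together with their F1 scores; a convenience cell using the variables defined above (exact_typed_* hold the values computed with the conversion map):
# summarize every span-level metric with its harmonic mean
metrics = {
    'exact': (exact_precision, exact_recall),
    'right boundary': (right_precision, right_recall),
    'partial': (partial_precision, partial_recall),
    'exact + mapped type': (exact_typed_precision, exact_typed_recall),
}
for name, (p, r) in metrics.items():
    print(f'{name}: P={p:.3f} R={r:.3f} F1={2 * p * r / (p + r):.3f}')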
# Display the performed NER
eval_annotations_doc = Document.from_dict(doc2.to_dict())
eval_annotations_doc
The first thing I notice is that NER finds far too many DATE entities that, given our objective, are not really important. This is the case for entities like "this week", and could also be the case for days of the week. The same holds for the TIME entity "only hours"; for that one we could instead consider the string "hours after", which carries more useful information, namely how the first event is placed in time relative to the second. The only DATE entity that will probably be useful is the first one: "1996-08-22".
The same can be said of the CARDINAL entity: "two" is probably not relevant.
Geopolitical entities (GPE) and locations (LOC) show some errors. NER is in fact able to distinguish "the Taiwan Strait" (LOC) from its substring "Taiwan" (GPE), but it would probably be more accurate to identify only "Taiwan Strait", leaving out the article "the". Another possible mistake comes in line 7 of the text: instead of identifying the state "Taiwan", it might be better to identify "Taiwan authorities" as an institution (ORG). Lastly, NER labels "Tang Shubei" as GPE ('Countries, cities, states') even though he is a person, later mentioned and correctly classified as "Tang".
'Nationalities or religious or political groups' (NORP) is a difficult type to handle. NER's classification is not wrong, but, given our objectives, we might prefer to treat "Taiwanese Vice President" as a reference to "Lien Chan" rather than only extract the nationality. It is true that Lien Chan is Taiwanese, but the most important piece of information the article conveys with these words is probably not that he is a Taiwanese vice president, but that he is the Vice President of Taiwan.
The 'Companies, agencies, institutions, etc.' (ORG) type also shows some errors. For example, "state media" is at first not recognized as an entity at all; then, a few rows later, only "State" is recognized as an institution. This classification is probably wrong, as "State" and "state media" refer to two similar but different institutions. While "Reuters" and "Reuters Television" are correctly recognized, "Shen" is the name of the person "Shen Guofang" cited earlier in the text, not an organization. The root of this problem may lie in how the sentence is formulated: NER may have identified Shen as an entity because of the preceding "'s", and then labeled him as an institution because of the proximity to "foreign ministry", of which Shen is the spokesman. Another error worth noting is that "foreign ministry" is not recognized the second time (while "Foreign Ministry", a few lines above, is), probably because of the missing capital letters.
txt
'China says Taiwan spoils atmosphere for talks . \nBEIJING 1996-08-22 \nChina on Thursday accused Taipei of spoiling the atmosphere for a resumption of talks across the Taiwan Strait with a visit to Ukraine by Taiwanese Vice President Lien Chan this week that infuriated Beijing . \nSpeaking only hours after Chinese state media said the time was right to engage in political talks with Taiwan , Foreign Ministry spokesman Shen Guofang told Reuters : " The necessary atmosphere for the opening of the talks has been disrupted by the Taiwan authorities . " \nState media quoted China \'s top negotiator with Taipei , Tang Shubei , as telling a visiting group from Taiwan on Wednesday that it was time for the rivals to hold political talks . \n" Now is the time for the two sides to engage in political talks ... \nthat is to end the state of hostility , " Thursday \'s overseas edition of the People \'s Daily quoted Tang as saying . \nThe foreign ministry \'s Shen told Reuters Television in an interview he had read reports of Tang \'s comments but gave no details of why the negotiator had considered the time right for talks with Taiwan , which Beijing considers a renegade province . \nChina , which has long opposed all Taipei efforts to gain greater international recognition , was infuriated by a visit to Ukraine this week by Taiwanese Vice President Lien . '
import spacy_transformers
import spacy_dbpedia_spotlight
ner.add_pipe('dbpedia_spotlight')
ner.pipeline
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1e6ecea7e80>), ('tagger', <spacy.pipeline.tagger.Tagger at 0x1e6ecea7d00>), ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1e6ecd12d60>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler at 0x1e6ecf1b840>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1e6ece9dd00>), ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1e6ecd12cf0>), ('dbpedia_spotlight', <spacy_dbpedia_spotlight.entity_linker.EntityLinker at 0x1e6eea195b0>)]
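The component was added here with its default settings; spacy_dbpedia_spotlight also accepts a config dict when the pipe is added. A sketch of a stricter setup (the 'confidence' key and the 0.5 threshold are assumptions about the installed version; check the package README):
# build a second pipeline whose Spotlight annotations must reach a minimum
# confidence; higher values trade recall for precision
ner_strict = spacy.load(spacy_model)
ner_strict.add_pipe('dbpedia_spotlight', config={'confidence': 0.5})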
doc_spotlight = ner(txt)
doc_spotlight.ents
(China, Taiwan, BEIJING, China, Taipei, Taiwan Strait, Ukraine, Taiwanese, Vice President, Chan, Beijing, Chinese, Taiwan, Foreign Ministry, Reuters, Taiwan, China, Taipei, Tang, Taiwan, Tang, Reuters, Tang, Taiwan, Beijing, China, Taipei, Ukraine, Taiwanese, Vice President)
import pandas as pd
# access dbpedia links using `ent.kb_id_`
pd.DataFrame(
[(ent.text, ent.kb_id_, ent._.dbpedia_raw_result['@similarityScore']) for ent in doc_spotlight.ents],
columns = ['mention', 'link', 'similarity']
)
 | mention | link | similarity
---|---|---|---
0 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
1 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
2 | BEIJING | http://dbpedia.org/resource/Government_of_China | 0.9633710026526001 |
3 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
4 | Taipei | http://dbpedia.org/resource/Government_of_the_... | 0.976859540640933 |
5 | Taiwan Strait | http://dbpedia.org/resource/Cross-Strait_relat... | 0.9999997051382232 |
6 | Ukraine | http://dbpedia.org/resource/Ukraine | 0.9997112846180765 |
7 | Taiwanese | http://dbpedia.org/resource/Taiwanese_people | 0.9931038004023124 |
8 | Vice President | http://dbpedia.org/resource/Vice_President_of_... | 0.9999999990778861 |
9 | Chan | http://dbpedia.org/resource/Jackie_Chan | 0.7107728194134195 |
10 | Beijing | http://dbpedia.org/resource/Government_of_China | 0.9633710026526001 |
11 | Chinese | http://dbpedia.org/resource/Chinese_language | 0.6431193698723345 |
12 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
13 | Foreign Ministry | http://dbpedia.org/resource/Ministry_of_Foreig... | 0.9999999998583462 |
14 | Reuters | http://dbpedia.org/resource/Reuters | 0.9999999982114787 |
15 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
16 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
17 | Taipei | http://dbpedia.org/resource/Government_of_the_... | 0.976859540640933 |
18 | Tang | http://dbpedia.org/resource/Names_of_China | 0.7477847217912609 |
19 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
20 | Tang | http://dbpedia.org/resource/Names_of_China | 0.7477847217912609 |
21 | Reuters | http://dbpedia.org/resource/Reuters | 0.9999999982114787 |
22 | Tang | http://dbpedia.org/resource/Names_of_China | 0.7477847217912609 |
23 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
24 | Beijing | http://dbpedia.org/resource/Government_of_China | 0.9633710026526001 |
25 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
26 | Taipei | http://dbpedia.org/resource/Government_of_the_... | 0.976859540640933 |
27 | Ukraine | http://dbpedia.org/resource/Ukraine | 0.9997112846180765 |
28 | Taiwanese | http://dbpedia.org/resource/Taiwanese_people | 0.9931038004023124 |
29 | Vice President | http://dbpedia.org/resource/Vice_President_of_... | 0.9999999990778861 |
import wikipedia
def generate_candidates_wikipedia(mentions):
candidates = {} # {mention1: [cand11, cand12], mention2: [cand21, cand22]}
for mention in mentions:
        # search returns the titles of the most relevant pages
pages = wikipedia.search(mention, results = 10)
filtered_pages = []
for title in pages:
try:
                # wikipedia.page fetches the full page object for this title
page = wikipedia.page(title)
filtered_pages.append(page)
except wikipedia.DisambiguationError:
# filter out disambiguation pages
pass
except wikipedia.PageError:
                # ignore missing pages (PageError)
pass
candidates[mention] = filtered_pages
return candidates
# get the list of mentions from the annset
mentions = [txt[ann.start:ann.end] for ann in gold_annotations_doc.annset('gold')] # in this example the gold annotations are used
mentions
['China', 'Taiwan', 'BEIJING', 'China', 'Taipei', 'Taiwan Strait', 'Ukraine', 'Taiwanese', 'Lien Chan', 'Beijing', 'Chinese', 'Taiwan', 'Foreign Ministry', 'Shen Guofang', 'Reuters', 'Taiwan', 'China', 'Taipei', 'Tang Shubei', 'Taiwan', "People 's Daily", 'Tang', 'Shen', 'Reuters Television', 'Tang', 'Taiwan', 'Beijing', 'China', 'Taipei', 'Ukraine', 'Taiwanese', 'Lien']
candidates = generate_candidates_wikipedia(mentions) # the dict keys de-duplicate repeated mentions
from pprint import pprint # pretty print
pprint(candidates)
{'BEIJING': [<WikipediaPage 'Beijing'>, <WikipediaPage 'Beijing Radio and Television Station'>, <WikipediaPage 'BAIC Group'>, <WikipediaPage '2022 Winter Olympics'>, <WikipediaPage 'Politics of Beijing'>, <WikipediaPage 'Beijing Guoan F.C.'>, <WikipediaPage 'Beijing Subway'>, <WikipediaPage '2012 Summer Olympics'>, <WikipediaPage '1989 Tiananmen Square protests and massacre'>], 'Beijing': [<WikipediaPage 'Beijing'>, <WikipediaPage 'Beijing Radio and Television Station'>, <WikipediaPage 'BAIC Group'>, <WikipediaPage '2022 Winter Olympics'>, <WikipediaPage 'Politics of Beijing'>, <WikipediaPage 'Beijing Subway'>, <WikipediaPage '2012 Summer Olympics'>, <WikipediaPage 'G4 Beijing–Hong Kong and Macau Expressway'>, <WikipediaPage '1989 Tiananmen Square protests and massacre'>], 'China': [<WikipediaPage 'Taiwan'>, <WikipediaPage 'History of China'>, <WikipediaPage 'Republic of China (1912–1949)'>, <WikipediaPage 'Chinese language'>, <WikipediaPage 'Mainland China'>, <WikipediaPage 'Chinese people'>, <WikipediaPage 'List of regions of China'>], 'Chinese': [<WikipediaPage 'Chinese language'>, <WikipediaPage 'History of China'>, <WikipediaPage 'Chinese characters'>, <WikipediaPage 'Rabbit (zodiac)'>, <WikipediaPage 'Standard Chinese'>, <WikipediaPage 'Mandarin Chinese'>, <WikipediaPage 'Taiwan'>], 'Foreign Ministry': [<WikipediaPage 'Ministry of foreign affairs'>, <WikipediaPage 'Ministry of Foreign Affairs (Israel)'>, <WikipediaPage 'Ministry for Europe and Foreign Affairs (France)'>, <WikipediaPage 'Ministry of Foreign Affairs (Russia)'>, <WikipediaPage 'Ministry of Foreign Affairs of the People's Republic of China'>, <WikipediaPage 'Ministry of Foreign Affairs (Afghanistan)'>, <WikipediaPage 'Foreign, Commonwealth and Development Office'>, <WikipediaPage 'Ministry of Foreign Affairs (Japan)'>, <WikipediaPage 'Ministry of Foreign Affairs (Pakistan)'>, <WikipediaPage 'Ministry of Foreign Affairs (Ukraine)'>], 'Lien': [<WikipediaPage 'Jennifer Lien'>, <WikipediaPage 'University of Pau and the Adour Region'>, <WikipediaPage 'Mechanic's lien'>, <WikipediaPage 'Tax lien'>, <WikipediaPage 'Phương Liên'>, <WikipediaPage 'Đinh dynasty'>, <WikipediaPage 'Crouching Tiger, Hidden Dragon'>, <WikipediaPage 'Banker's lien'>], 'Lien Chan': [<WikipediaPage 'Lien Chan'>, <WikipediaPage 'Ma Ying-jeou'>, <WikipediaPage 'Kuomintang'>, <WikipediaPage '2005 Pan–Blue visits to mainland China'>, <WikipediaPage '2000 Taiwanese presidential election'>, <WikipediaPage 'Lee Teng-hui'>, <WikipediaPage '2004 Taiwanese presidential election'>, <WikipediaPage '1996 Taiwanese presidential election'>, <WikipediaPage 'Vincent Siew'>], "People 's Daily": [<WikipediaPage 'People's Daily'>, <WikipediaPage 'People's Court Daily'>, <WikipediaPage 'People's World'>, <WikipediaPage 'People's Liberation Army Daily'>, <WikipediaPage 'Dziennik Ludowy (People's Daily)'>, <WikipediaPage 'The Daily Show'>, <WikipediaPage 'The Daily Telegraph'>, <WikipediaPage 'Everyday life'>], 'Reuters': [<WikipediaPage 'Reuters'>, <WikipediaPage 'Thomson Reuters'>, <WikipediaPage 'Fritz Reuter'>, <WikipediaPage 'Paul Reuter'>, <WikipediaPage 'Suzanne Reuter'>, <WikipediaPage 'Reuters Group'>, <WikipediaPage 'Ernst Reuter'>, <WikipediaPage 'Reuter concession'>, <WikipediaPage 'Saudi Arabian–led intervention in Yemen'>], 'Reuters Television': [<WikipediaPage 'Reuters'>, <WikipediaPage 'Thomson Reuters'>, <WikipediaPage 'Reuters Group'>, <WikipediaPage 'Reuters TV'>, <WikipediaPage 'The Refinitiv Business Classification'>, <WikipediaPage '2017 in American 
television'>, <WikipediaPage 'Suzanne Reuter'>, <WikipediaPage 'Jules Asner'>, <WikipediaPage 'Christine Romans'>], 'Shen': [<WikipediaPage 'Shěn'>, <WikipediaPage 'Shen Changyin and Shen Changping'>, <WikipediaPage 'Shen Teng'>, <WikipediaPage 'Shen Congwen'>, <WikipediaPage 'Shen Quan'>, <WikipediaPage 'O-Shen'>, <WikipediaPage 'Shen Gongbao'>], 'Shen Guofang': [<WikipediaPage 'Murray's law'>, <WikipediaPage 'Porous medium'>, <WikipediaPage 'Guofang Wei'>, <WikipediaPage 'List of members of the Chinese Academy of Engineering'>, <WikipediaPage 'Ministry of Foreign Affairs of the People's Republic of China'>, <WikipediaPage 'List of Foreign Ministry Spokespersons of the People's Republic of China'>, <WikipediaPage 'Diagnostic delay'>, <WikipediaPage 'National Defense Medical Center'>, <WikipediaPage 'Central Military Commission (China)'>, <WikipediaPage 'List of ships of the People's Liberation Army Navy'>], 'Taipei': [<WikipediaPage 'Taipei'>, <WikipediaPage 'Taipei 101'>, <WikipediaPage 'Chinese Taipei'>, <WikipediaPage 'Taiwan'>, <WikipediaPage 'New Taipei City'>, <WikipediaPage 'Taipei Metro'>, <WikipediaPage '2023 Taipei Open'>, <WikipediaPage 'Taoyuan International Airport'>, <WikipediaPage 'Taipei Fubon Bank'>], 'Taiwan': [<WikipediaPage 'Taiwan'>, <WikipediaPage 'Regions of Taiwan'>, <WikipediaPage 'History of Taiwan'>, <WikipediaPage 'Taiwanese people'>, <WikipediaPage 'Taiwanese indigenous peoples'>, <WikipediaPage 'Political status of Taiwan'>, <WikipediaPage 'Geography of Taiwan'>, <WikipediaPage 'Taiwan, China'>], 'Taiwan Strait': [<WikipediaPage 'Taiwan Strait'>, <WikipediaPage 'Third Taiwan Strait Crisis'>, <WikipediaPage 'Second Taiwan Strait Crisis'>, <WikipediaPage 'First Taiwan Strait Crisis'>, <WikipediaPage 'Cross-Strait relations'>, <WikipediaPage 'Political status of Taiwan'>, <WikipediaPage 'Taiwan Strait Crises'>, <WikipediaPage '2022 Chinese military exercises around Taiwan'>, <WikipediaPage 'Geography of Taiwan'>, <WikipediaPage 'Taiwan Strait Tunnel Project'>], 'Taiwanese': [<WikipediaPage 'Taiwan'>, <WikipediaPage 'Taiwanese Hokkien'>, <WikipediaPage 'Taiwanese people'>, <WikipediaPage 'Taiwanese indigenous peoples'>, <WikipediaPage 'Taiwanese Americans'>, <WikipediaPage 'Taiwanese cuisine'>, <WikipediaPage 'Han Taiwanese'>, <WikipediaPage 'Taiwanese Hakka'>, <WikipediaPage 'Taiwanese Mandarin'>], 'Tang': [<WikipediaPage 'Tank'>, <WikipediaPage 'Song dynasty'>, <WikipediaPage 'Wu-Tang Clan'>, <WikipediaPage 'Later Tang'>, <WikipediaPage 'Tang Wei'>, <WikipediaPage 'Tang Sanzang'>, <WikipediaPage 'Tang dynasty'>, <WikipediaPage 'Tang (drink mix)'>, <WikipediaPage 'Tangs'>], 'Tang Shubei': [<WikipediaPage 'Qiandao Lake incident'>, <WikipediaPage 'Fallen City'>], 'Ukraine': [<WikipediaPage 'Ukraine'>, <WikipediaPage 'Russian invasion of Ukraine'>, <WikipediaPage 'Ukrainians'>, <WikipediaPage 'Ukrainian language'>, <WikipediaPage 'Armed Forces of Ukraine'>, <WikipediaPage 'Ukrainian Soviet Socialist Republic'>, <WikipediaPage '2023 Belgorod Oblast incursion'>, <WikipediaPage 'History of Ukraine'>]}
import textdistance
jaccard = textdistance.Jaccard(qval = 2)
candidate_rankings = {}
for mention, m_candidates in candidates.items():
similarities = [
jaccard.similarity(mention.lower(), m_cand.title.lower())
for m_cand in m_candidates
]
cand_and_sim = list(zip(m_candidates, similarities))
cand_and_sim.sort(reverse=True, key=lambda x: x[1]) # sort descending by similarity
candidate_rankings[mention] = cand_and_sim
best_candidates = {mention:cands[0] for mention, cands in candidate_rankings.items()}
pprint(best_candidates)
{'BEIJING': (<WikipediaPage 'Beijing'>, 1),
 'Beijing': (<WikipediaPage 'Beijing'>, 1),
 'China': (<WikipediaPage 'Mainland China'>, 0.3076923076923077),
 'Chinese': (<WikipediaPage 'Chinese language'>, 0.4),
 'Foreign Ministry': (<WikipediaPage 'Ministry of foreign affairs'>, 0.5185185185185185),
 'Lien': (<WikipediaPage 'Tax lien'>, 0.42857142857142855),
 'Lien Chan': (<WikipediaPage 'Lien Chan'>, 1),
 "People 's Daily": (<WikipediaPage 'People's Daily'>, 0.8),
 'Reuters': (<WikipediaPage 'Reuters'>, 1),
 'Reuters Television': (<WikipediaPage 'Reuters TV'>, 0.4444444444444444),
 'Shen': (<WikipediaPage 'O-Shen'>, 0.6),
 'Shen Guofang': (<WikipediaPage 'Guofang Wei'>, 0.4),
 'Taipei': (<WikipediaPage 'Taipei'>, 1),
 'Taiwan': (<WikipediaPage 'Taiwan'>, 1),
 'Taiwan Strait': (<WikipediaPage 'Taiwan Strait'>, 1),
 'Taiwanese': (<WikipediaPage 'Han Taiwanese'>, 0.6666666666666666),
 'Tang': (<WikipediaPage 'Tangs'>, 0.75),
 'Tang Shubei': (<WikipediaPage 'Qiandao Lake incident'>, 0.034482758620689655),
 'Ukraine': (<WikipediaPage 'Ukraine'>, 1)}
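The score used for this ranking is Jaccard similarity over character bigrams (qval = 2): each string is decomposed into overlapping two-character grams, and the ratio between shared and total grams is returned. A small worked example (the strings are purely illustrative):
# 'china'   -> bigrams {ch, hi, in, na}
# 'chinese' -> bigrams {ch, hi, in, ne, es, se}
# 3 shared grams out of 7 distinct ones -> 3/7 ≈ 0.4286
jaccard.similarity('china', 'chinese')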
import requests
api_biencoder = 'https://vm.chronos.disco.unimib.it/api/blink/biencoder/mention/doc'
api_indexer = 'https://vm.chronos.disco.unimib.it/api/indexer/search/doc/10'
api_crossencoder = 'https://vm.chronos.disco.unimib.it/api/blink/crossencoder/doc/10'
gold_path = 'AIDA-YAGO2_2_GATE.json' # reassign this variable with your dataset
with open(gold_path, 'r') as inp:
gold_dict = json.load(inp)
# rename needed to use the API (deep copy, so the original 'gold' set keeps its features)
import copy
gold_dict['annotation_sets']['entities_merged'] = copy.deepcopy(gold_dict['annotation_sets']['gold'])
gold_dict['annotation_sets']['entities_merged']['name'] = 'entities_merged'
for ann in gold_dict['annotation_sets']['entities_merged']['annotations']:
ann['features'] = {}
gold_annotations_doc = Document.from_dict(gold_dict)
gold_annotations_doc
import numpy as np
import base64
def vector_encode(v):
s = base64.b64encode(v).decode()
return s
def vector_decode(s, dtype=np.float32):
buffer = base64.b64decode(s)
v = np.frombuffer(buffer, dtype=dtype)
return v
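A quick round-trip check that the two helpers invert each other (np.float32 matches the decoder's default dtype):
# encode a small float32 vector to base64 text and decode it back
v = np.arange(4, dtype=np.float32)
assert np.array_equal(vector_decode(vector_encode(v)), v)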
from requests.auth import HTTPBasicAuth
auth = HTTPBasicAuth('DS2023', 'eexeegheichai3OhChi5AhcheecaaShe')
gold_annotations_doc.features['pipeline'] = [] # force the API to run
gold_annotations_doc.features['pipeline']
[]
# call the Bi-Encoder, which only encodes each mention and its context into a vector
res = requests.post(api_biencoder, auth = auth, json = gold_annotations_doc.to_dict())
res
<Response [200]>
assert res.ok
doc = res.json()
gdoc = Document.from_dict(doc)
gdoc
# obtain the candidate rankings through nearest-neighbour search on the vectors encoded by the Bi-Encoder
res = requests.post(api_indexer, auth = auth, json = gdoc.to_dict())
if res.ok:
doc = res.json()
gdocBi = Document.from_dict(doc)
else:
print('error')
print(res.content)
for ann in gdocBi.annset('entities_merged'):
print(ann.features['linking']['top_candidate'])
{'raw_score': 326.15081787109375, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 81.44474029541016, 'norm_score': 0.5004747453135283} {'raw_score': 322.3394775390625, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.04270935058594, 'norm_score': 0.5179859472060483} {'raw_score': 324.72314453125, 'id': 2301073, 'wikipedia_id': 18603746, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Beijing', 'url': 'https://en.wikipedia.org/wiki?curid=18603746', 'type_': None, 'indexer': 0, 'score': 82.0356674194336, 'norm_score': 0.4883515463892926} {'raw_score': 325.8661193847656, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 81.20738220214844, 'norm_score': 0.49901618910913265} {'raw_score': 328.84820556640625, 'id': 28984, 'wikipedia_id': 57648, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taipei', 'url': 'https://en.wikipedia.org/wiki?curid=57648', 'type_': None, 'indexer': 0, 'score': 79.6422348022461, 'norm_score': 0.47778394320151724} {'raw_score': 334.675537109375, 'id': 42870, 'wikipedia_id': 91052, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan Strait', 'url': 'https://en.wikipedia.org/wiki?curid=91052', 'type_': None, 'indexer': 0, 'score': 82.00558471679688, 'norm_score': 0.496255599517426} {'raw_score': 317.92822265625, 'id': 15756, 'wikipedia_id': 31750, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ukraine', 'url': 'https://en.wikipedia.org/wiki?curid=31750', 'type_': None, 'indexer': 0, 'score': 87.6226806640625, 'norm_score': 0.554060116055794} {'raw_score': 326.936767578125, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 81.32420349121094, 'norm_score': 0.5072666210628048} {'raw_score': 339.0760192871094, 'id': 82451, 'wikipedia_id': 140551, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Lien Chan', 'url': 'https://en.wikipedia.org/wiki?curid=140551', 'type_': None, 'indexer': 0, 'score': 80.01026153564453, 'norm_score': 0.4225672789540661} {'raw_score': 327.8848876953125, 'id': 2301073, 'wikipedia_id': 18603746, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Beijing', 'url': 'https://en.wikipedia.org/wiki?curid=18603746', 'type_': None, 'indexer': 0, 'score': 80.9153823852539, 'norm_score': 0.481682582193009} {'raw_score': 326.45635986328125, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 80.62527465820312, 'norm_score': 0.49543916100707935} {'raw_score': 323.4985656738281, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.46719360351562, 'norm_score': 0.5206337037586383} {'raw_score': 326.993896484375, 'id': 1574394, 'wikipedia_id': 10789929, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ministry of Foreign Affairs (Taiwan)', 'url': 'https://en.wikipedia.org/wiki?curid=10789929', 'type_': None, 
'indexer': 0, 'score': 82.36274719238281, 'norm_score': 0.49877553104761246} {'raw_score': 337.355712890625, 'id': 189629, 'wikipedia_id': 457455, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Zeng Guofan', 'url': 'https://en.wikipedia.org/wiki?curid=457455', 'type_': None, 'indexer': 0, 'score': 81.17063903808594, 'norm_score': 0.4463794058794387} {'raw_score': 326.7002258300781, 'id': 2350980, 'wikipedia_id': 18998750, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Reuters', 'url': 'https://en.wikipedia.org/wiki?curid=18998750', 'type_': None, 'indexer': 0, 'score': 83.73574829101562, 'norm_score': 0.5179096424892523} {'raw_score': 323.9019775390625, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.23236083984375, 'norm_score': 0.5191689144655522} {'raw_score': 325.56353759765625, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 81.94345092773438, 'norm_score': 0.5035393026537852} {'raw_score': 327.9543151855469, 'id': 28984, 'wikipedia_id': 57648, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taipei', 'url': 'https://en.wikipedia.org/wiki?curid=57648', 'type_': None, 'indexer': 0, 'score': 80.64089965820312, 'norm_score': 0.4837750612810223} {'raw_score': 339.0580749511719, 'id': 555105, 'wikipedia_id': 2099360, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Tang Fei', 'url': 'https://en.wikipedia.org/wiki?curid=2099360', 'type_': None, 'indexer': 0, 'score': 79.37348937988281, 'norm_score': 0.43356447857278485} {'raw_score': 323.1243591308594, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.52281188964844, 'norm_score': 0.5209806275385797} {'raw_score': 325.2178955078125, 'id': 129288, 'wikipedia_id': 263163, 'wikidata_qid': None, 'redirects_to': None, 'title': "People's Daily", 'url': 'https://en.wikipedia.org/wiki?curid=263163', 'type_': None, 'indexer': 0, 'score': 85.50372314453125, 'norm_score': 0.49932806565854604} {'raw_score': 329.8238525390625, 'id': 4192559, 'wikipedia_id': 39652240, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Tāng (surname)', 'url': 'https://en.wikipedia.org/wiki?curid=39652240', 'type_': None, 'indexer': 0, 'score': 79.84367370605469, 'norm_score': 0.48148635451503247} {'raw_score': 329.2820739746094, 'id': 5744441, 'wikipedia_id': 59178284, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Shen Chang-huan', 'url': 'https://en.wikipedia.org/wiki?curid=59178284', 'type_': None, 'indexer': 0, 'score': 79.95675659179688, 'norm_score': 0.43394921489291627} {'raw_score': 333.9736633300781, 'id': 5134298, 'wikipedia_id': 51220220, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Reuters TV', 'url': 'https://en.wikipedia.org/wiki?curid=51220220', 'type_': None, 'indexer': 0, 'score': 81.90070343017578, 'norm_score': 0.46144705581830214} {'raw_score': 327.4975891113281, 'id': 2273003, 'wikipedia_id': 18353912, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ignacio Milam Tang', 'url': 'https://en.wikipedia.org/wiki?curid=18353912', 'type_': None, 'indexer': 0, 'score': 81.07706451416016, 'norm_score': 0.4543629102792197} {'raw_score': 322.83648681640625, 'id': 12748, 'wikipedia_id': 25734, 
'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.43959045410156, 'norm_score': 0.5204615267715562} {'raw_score': 328.26019287109375, 'id': 2301073, 'wikipedia_id': 18603746, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Beijing', 'url': 'https://en.wikipedia.org/wiki?curid=18603746', 'type_': None, 'indexer': 0, 'score': 80.88457489013672, 'norm_score': 0.4814991877213832} {'raw_score': 326.48834228515625, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 80.95240020751953, 'norm_score': 0.49744933471978137} {'raw_score': 327.7627258300781, 'id': 28984, 'wikipedia_id': 57648, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taipei', 'url': 'https://en.wikipedia.org/wiki?curid=57648', 'type_': None, 'indexer': 0, 'score': 80.20447540283203, 'norm_score': 0.4811568964070991} {'raw_score': 319.93878173828125, 'id': 15756, 'wikipedia_id': 31750, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ukraine', 'url': 'https://en.wikipedia.org/wiki?curid=31750', 'type_': None, 'indexer': 0, 'score': 85.62303161621094, 'norm_score': 0.5414158352014888} {'raw_score': 326.2209777832031, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 81.30545806884766, 'norm_score': 0.5071496948999357} {'raw_score': 334.766357421875, 'id': 82451, 'wikipedia_id': 140551, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Lien Chan', 'url': 'https://en.wikipedia.org/wiki?curid=140551', 'type_': None, 'indexer': 0, 'score': 80.24974822998047, 'norm_score': 0.42383210722520176}
gdoc = gdocBi
# rerank the candidates with the Cross-Encoder (expected to be more accurate than the Bi-Encoder alone)
# it is normal for this call to be slow; do not run this cell too many times
res = requests.post(api_crossencoder, auth = auth, json = gdoc.to_dict())
if res.ok:
doc = res.json()
gdoc = Document.from_dict(doc)
else:
print('error')
print(res.content)
gdoc
# grab the first annotation to inspect its candidate list
for ann in gdoc.annset('entities_merged'):
break
ann.features['linking']['candidates']
[{'id': 2397, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'indexer': 0, 'score': 6.211941719055176, 'bi_score': 81.44474029541016, 'raw_score': 326.15081787109375, 'is_cross': True, 'wikipedia_id': 5405, 'type_': None, 'norm_score': 0.5004747453135283}, {'id': 12748, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'indexer': 0, 'score': -1.6825027465820312, 'bi_score': 78.55805969238281, 'raw_score': 331.9241638183594, 'is_cross': True, 'wikipedia_id': 25734, 'type_': None, 'norm_score': 0.4900125643617512}, {'id': 750050, 'title': 'Government of China', 'url': 'https://en.wikipedia.org/wiki?curid=3205521', 'indexer': 0, 'score': -6.467350482940674, 'bi_score': 78.4769287109375, 'raw_score': 332.08642578125, 'is_cross': True, 'wikipedia_id': 3205521, 'type_': None, 'norm_score': 0.4881161747591437}, {'id': 1403259, 'title': 'Government of the Republic of China', 'url': 'https://en.wikipedia.org/wiki?curid=8717276', 'indexer': 0, 'score': -6.642155170440674, 'bi_score': 78.0320053100586, 'raw_score': 332.97625732421875, 'is_cross': True, 'wikipedia_id': 8717276, 'type_': None, 'norm_score': 0.48854284955788235}, {'id': 67524, 'title': 'China Airlines', 'url': 'https://en.wikipedia.org/wiki?curid=124485', 'indexer': 0, 'score': -6.757944583892822, 'bi_score': 77.9298095703125, 'raw_score': 333.1806640625, 'is_cross': True, 'wikipedia_id': 124485, 'type_': None, 'norm_score': 0.44623364529866577}, {'id': 1120926, 'title': 'United States of China', 'url': 'https://en.wikipedia.org/wiki?curid=5980963', 'indexer': 0, 'score': -7.1964030265808105, 'bi_score': 78.06914520263672, 'raw_score': 332.9019775390625, 'is_cross': True, 'wikipedia_id': 5980963, 'type_': None, 'norm_score': 0.480572955431974}, {'id': 322457, 'title': 'China Times', 'url': 'https://en.wikipedia.org/wiki?curid=987480', 'indexer': 0, 'score': -7.954238414764404, 'bi_score': 78.10570526123047, 'raw_score': 332.82891845703125, 'is_cross': True, 'wikipedia_id': 987480, 'type_': None, 'norm_score': 0.4364163766788583}, {'id': 5762366, 'title': 'Foreign policy of China', 'url': 'https://en.wikipedia.org/wiki?curid=59502393', 'indexer': 0, 'score': -8.213769912719727, 'bi_score': 78.18390655517578, 'raw_score': 332.6724853515625, 'is_cross': True, 'wikipedia_id': 59502393, 'type_': None, 'norm_score': 0.4912969349358342}, {'id': 4343573, 'title': 'China Current', 'url': 'https://en.wikipedia.org/wiki?curid=41257924', 'indexer': 0, 'score': -8.65864372253418, 'bi_score': 77.98200988769531, 'raw_score': 333.0762939453125, 'is_cross': True, 'wikipedia_id': 41257924, 'type_': None, 'norm_score': 0.45241265356018845}, {'id': 157682, 'title': 'China (disambiguation)', 'url': 'https://en.wikipedia.org/wiki?curid=356842', 'indexer': 0, 'score': -12.302020072937012, 'bi_score': 77.96675109863281, 'raw_score': 333.1067810058594, 'is_cross': True, 'wikipedia_id': 356842, 'type_': None, 'norm_score': 0.5666465468999536}]
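The candidates are already ordered by the cross-encoder score (the 'score' field where is_cross is true; 'bi_score' keeps the bi-encoder value), so the head of the list is the reranked prediction. Printing the first few titles makes the ranking easier to read:
# show the three best candidates for this first mention
for cand in ann.features['linking']['candidates'][:3]:
    print(f"{cand['title']}: cross={cand['score']:.2f}, bi={cand['bi_score']:.2f}")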
# Build a dictionary mapping each mention to the Wikipedia page title found by the naive NEL
NaiveLink = {}
for mention in best_candidates:
    NaiveLink[mention] = best_candidates[mention][0].original_title
# Build two parallel lists: the mention strings and the Wikipedia page titles found by the Bi-Encoder
names = []
BiCandidate = []
for ann in gdocBi.annset('entities_merged'):
names.append(txt[ann.start:ann.end])
BiCandidate.append(ann.features['linking']['top_candidate']['title'])
# Build the list of Wikipedia page titles found by the full BLINK approach (Bi-Encoder + Cross-Encoder)
BlinkCandidate = []
i = 0
for ann in gdoc.annset('entities_merged'):
    # If the two annotation sets do not list the same mentions in the same order, we risk building a wrong dataframe: report an error and stop
if txt[ann.start:ann.end] != names[i]:
print('Error, the two lists of entities are not the same: ' + txt[ann.start:ann.end] + ' is not equal to ' + names[i])
break
i += 1
BlinkCandidate.append(ann.features['linking']['top_candidate']['title'])
# Build the list of Wikipedia page titles for the naive approach, using the linking dictionary built above
NaiveCandidate = []
for name in names:
NaiveCandidate.append(NaiveLink[name])
data = {'names': names,
'NaiveCandidate': NaiveCandidate,
'BiCandidate': BiCandidate,
'BlinkCandidate': BlinkCandidate}
I believe that this approach (which keeps the duplicate entities) is better, because the BLINK candidates can change their linked Wikipedia page based on the context. While this is not possible for the naive NEL approach, the BLINK approaches can therefore link two mentions with the exact same surface form to different Wikipedia pages.
df = pd.DataFrame(data)
df
 | names | NaiveCandidate | BiCandidate | BlinkCandidate
---|---|---|---|---
0 | China | Mainland China | China | China |
1 | Taiwan | Taiwan | Taiwan | Taiwan |
2 | BEIJING | Beijing | Beijing | Beijing |
3 | China | Mainland China | China | China |
4 | Taipei | Taipei | Taipei | Taipei |
5 | Taiwan Strait | Taiwan Strait | Taiwan Strait | Taiwan Strait |
6 | Ukraine | Ukraine | Ukraine | Ukraine |
7 | Taiwanese | Han Taiwanese | Taiwan | Taiwan |
8 | Lien Chan | Lien Chan | Lien Chan | Lien Chan |
9 | Beijing | Beijing | Beijing | Beijing |
10 | Chinese | Chinese language | China | China |
11 | Taiwan | Taiwan | Taiwan | Taiwan |
12 | Foreign Ministry | Ministry of foreign affairs | Ministry of Foreign Affairs (Taiwan) | Ministry of Foreign Affairs of the People's Re... |
13 | Shen Guofang | Guofang Wei | Zeng Guofan | Zeng Guofan |
14 | Reuters | Reuters | Reuters | Reuters |
15 | Taiwan | Taiwan | Taiwan | Taiwan |
16 | China | Mainland China | China | China |
17 | Taipei | Taipei | Taipei | Taipei |
18 | Tang Shubei | Qiandao Lake incident | Tang Fei | Tang Fei |
19 | Taiwan | Taiwan | Taiwan | Taiwan |
20 | People 's Daily | People's Daily | People's Daily | People's Daily |
21 | Tang | Tangs | Tāng (surname) | Tang dynasty |
22 | Shen | O-Shen | Shen Chang-huan | Shen Chang-huan |
23 | Reuters Television | Reuters TV | Reuters TV | Reuters TV |
24 | Tang | Tangs | Ignacio Milam Tang | Tang dynasty |
25 | Taiwan | Taiwan | Taiwan | Taiwan |
26 | Beijing | Beijing | Beijing | Beijing |
27 | China | Mainland China | China | China |
28 | Taipei | Taipei | Taipei | Taipei |
29 | Ukraine | Ukraine | Ukraine | Ukraine |
30 | Taiwanese | Han Taiwanese | Taiwan | Taiwan |
31 | Lien | Tax lien | Lien Chan | Lien Chan |
The naive approach wrongly links "China" to "Mainland China", i.e. the part of China excluding the territories controlled by the Republic of China (Kinmen, Matsu, and the Pescadores). This is clearly a mistake, as the text's "China" refers to the People's Republic of China as a whole, which the other two approaches link correctly.
For all the other geographic locations, none of the NEL approaches makes any mistake (Taiwan, Ukraine, Taipei, Taiwan Strait, ...). A clear difference between the naive approach and the other two shows up, though, in the linking of nationalities: the naive approach links "Taiwanese" to "Han Taiwanese" and "Chinese" to "Chinese language", while the context-aware approaches link both to the corresponding country.
As for people: the naive approach links "Lien" to "Tax lien", whereas the Bi-Encoder and BLINK, which use the context, recover "Lien Chan"; on the other hand, none of the three approaches links "Shen Guofang" or "Tang Shubei" correctly.
Another interesting entity is "Foreign Ministry", where the ability of the encoder-based approaches to disambiguate from context is really handy: instead of the generic "Ministry of foreign affairs" page chosen by the naive approach, they point to a specific country's ministry.
# Re-index the dataframe by annotation start offset, so each row can be looked up from a gold annotation's start
start = []
for ann in gdocBi.annset('entities_merged'):
start.append(ann.start)
df.index = start
df.head()
 | names | NaiveCandidate | BiCandidate | BlinkCandidate
---|---|---|---|---
0 | China | Mainland China | China | China |
11 | Taiwan | Taiwan | Taiwan | Taiwan |
49 | BEIJING | Beijing | Beijing | Beijing |
69 | China | Mainland China | China | China |
95 | Taipei | Taipei | Taipei | Taipei |
In this case it does not make sense to calculate both accuracy and recall, because they are the same thing: there are no "false positives" or "true negatives" to count separately, since the gold standard is by definition always correct and every gold mention receives exactly one predicted link.
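Since the same loop is repeated below once per approach, it can be factored into a small helper; a sketch under the same assumptions (df indexed by start offset, gold links stored in the 'link' feature):
def nel_accuracy(column):
    # fraction of gold annotations whose gold page title matches the
    # candidate title chosen by the given approach
    correct = 0
    for gold_ann in gold_annset:
        candidate = df.loc[gold_ann.start][column]
        gold_title = gold_ann.features['link'].split('/')[-1].replace('_', ' ')
        if candidate == gold_title:
            correct += 1
    return correct / len(gold_annset)
The explicit loops are kept below for readability; nel_accuracy('NaiveCandidate') would reproduce the first of them.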
correct = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['NaiveCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
correct += 1
accuracy = correct / len(gold_annset)
accuracy
0.3125
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['NaiveCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
true_positive += 1
else:
false_negative += 1
exact_recall = true_positive / (true_positive + false_negative)
exact_recall
0.3125
correct = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['BiCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
correct += 1
accuracy = correct / len(gold_annset)
accuracy
0.3125
correct = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['BlinkCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
correct += 1
accuracy = correct / len(gold_annset)
accuracy
0.34375