import json
# Read the gold-standard document from the JSON file
with open('AIDA-YAGO2_2_GATE.json', 'r') as f:
    data = json.load(f)
txt = data['text']
txt
'China says Taiwan spoils atmosphere for talks . \nBEIJING 1996-08-22 \nChina on Thursday accused Taipei of spoiling the atmosphere for a resumption of talks across the Taiwan Strait with a visit to Ukraine by Taiwanese Vice President Lien Chan this week that infuriated Beijing . \nSpeaking only hours after Chinese state media said the time was right to engage in political talks with Taiwan , Foreign Ministry spokesman Shen Guofang told Reuters : " The necessary atmosphere for the opening of the talks has been disrupted by the Taiwan authorities . " \nState media quoted China \'s top negotiator with Taipei , Tang Shubei , as telling a visiting group from Taiwan on Wednesday that it was time for the rivals to hold political talks . \n" Now is the time for the two sides to engage in political talks ... \nthat is to end the state of hostility , " Thursday \'s overseas edition of the People \'s Daily quoted Tang as saying . \nThe foreign ministry \'s Shen told Reuters Television in an interview he had read reports of Tang \'s comments but gave no details of why the negotiator had considered the time right for talks with Taiwan , which Beijing considers a renegade province . \nChina , which has long opposed all Taipei efforts to gain greater international recognition , was infuriated by a visit to Ukraine this week by Taiwanese Vice President Lien . '
import spacy
from spacy.cli import download as spacy_download
spacy_model = 'en_core_web_sm'
spacy_download(spacy_model)
ner = spacy.load(spacy_model)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
# Perform NER
doc_spacy = ner(txt)
from spacy import displacy
displacy.render(doc_spacy, style='ent', jupyter=True)
from gatenlp import Document
# Open and display the gold standard
with open('AIDA-YAGO2_2_GATE.json', 'r') as inp:
gold_dict = json.load(inp)
gold_annotations_doc = Document.from_dict(gold_dict)
gold_annotations_doc
doc2 = Document(txt)
annset_ner = doc2.annset('spacy')
# add annotations from spacy to gatenlp
for ann in doc_spacy.ents:
annset_ner.add(ann.start_char, ann.end_char, ann.label_) #add the annotations to GateNLP Document
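Because the gatenlp annotations are built from spaCy's character offsets, it is worth verifying that those offsets really address the same strings in the raw text; a quick sanity check, not part of the original pipeline:
# every spaCy entity span should read back identically from the raw text
for ent in doc_spacy.ents:
    assert txt[ent.start_char:ent.end_char] == ent.text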
# Display the performed NER
eval_annotations_doc = Document.from_dict(doc2.to_dict())
eval_annotations_doc
gold_annset = gold_annotations_doc.annset('gold')
to_eval_annset = eval_annotations_doc.annset('spacy')
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.start == gold_ann_matched[0].start and ann.end == gold_ann_matched[0].end:
true_positive += 1
else:
false_positive += 1
exact_precision = true_positive / (true_positive + false_positive)
exact_precision
0.7317073170731707
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.start == to_eval_ann_matched[0].start and gold_ann.end == to_eval_ann_matched[0].end:
true_positive += 1
else:
false_negative += 1
exact_recall = true_positive / (true_positive + false_negative)
exact_recall
0.9375
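Precision and recall are usually combined into their harmonic mean, F1. Since exact matching is symmetric, the two loops above count the same true positives, so the value is well defined; a small convenience computation from the two values above:
# F1 is the harmonic mean of precision and recall
exact_f1 = 2 * exact_precision * exact_recall / (exact_precision + exact_recall)
exact_f1  # ≈ 0.822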
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.end == gold_ann_matched[0].end:
true_positive += 1
else:
false_positive += 1
right_precision = true_positive / (true_positive + false_positive)
right_precision
0.7804878048780488
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.end == to_eval_ann_matched[0].end:
true_positive += 1
else:
false_negative += 1
right_recall = true_positive / (true_positive + false_negative)
right_recall
0.9375
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) != 1:
false_positive += 1
continue
    # Build two lists containing the character offsets covered by the entity, both for the gold standard and for our NER
ListG = [*range(gold_ann_matched[0].start, gold_ann_matched[0].end)]
ListE = [*range(ann.start, ann.end)]
    # If the two lists share at least one offset, then NER found at least a fragment of the gold annotation
if any(item in ListE for item in ListG):
true_positive += 1
else:
false_positive += 1
partial_precision = true_positive / (true_positive + false_positive)
partial_precision
0.7804878048780488
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
ListG = [*range(gold_ann.start, gold_ann.end)]
if len(to_eval_ann_matched) != 1:
false_negative += 1
continue
ListE = [*range(to_eval_ann_matched[0].start, to_eval_ann_matched[0].end)]
if any(item in ListE for item in ListG):
true_positive += 1
else:
false_negative += 1
partial_recall = true_positive / (true_positive + false_negative)
partial_recall
0.9375
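Materializing the offset ranges as lists works, but two half-open spans [s1, e1) and [s2, e2) share at least one offset exactly when each starts before the other ends, so the same test can be done in constant time; a minimal sketch of the equivalent check:
def spans_overlap(a, b):
    # half-open intervals [start, end) overlap iff each starts before the other ends
    return a.start < b.end and b.start < a.end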
print('------------------ GOLD ANNOTATION SET -----------------')
for ann in gold_annset:
print({'start':ann.start, 'end':ann.end, 'type':ann.type, 'id':ann.id, 'mention':txt[ann.start:ann.end]})
print('------------------ SPACY ANNOTATION SET -----------------')
for ann in to_eval_annset:
print({'start':ann.start, 'end':ann.end, 'type':ann.type, 'id':ann.id, 'mention':txt[ann.start:ann.end]})
------------------ GOLD ANNOTATION SET -----------------
{'start': 0, 'end': 5, 'type': 'LOC', 'id': 0, 'mention': 'China'}
{'start': 11, 'end': 17, 'type': 'LOC', 'id': 1, 'mention': 'Taiwan'}
{'start': 49, 'end': 56, 'type': 'LOC', 'id': 2, 'mention': 'BEIJING'}
{'start': 69, 'end': 74, 'type': 'LOC', 'id': 3, 'mention': 'China'}
{'start': 95, 'end': 101, 'type': 'LOC', 'id': 4, 'mention': 'Taipei'}
{'start': 166, 'end': 179, 'type': 'LOC', 'id': 5, 'mention': 'Taiwan Strait'}
{'start': 196, 'end': 203, 'type': 'LOC', 'id': 6, 'mention': 'Ukraine'}
{'start': 207, 'end': 216, 'type': 'MISC', 'id': 7, 'mention': 'Taiwanese'}
{'start': 232, 'end': 241, 'type': 'PER', 'id': 8, 'mention': 'Lien Chan'}
{'start': 268, 'end': 275, 'type': 'LOC', 'id': 9, 'mention': 'Beijing'}
{'start': 305, 'end': 312, 'type': 'MISC', 'id': 10, 'mention': 'Chinese'}
{'start': 383, 'end': 389, 'type': 'LOC', 'id': 11, 'mention': 'Taiwan'}
{'start': 392, 'end': 408, 'type': 'ORG', 'id': 12, 'mention': 'Foreign Ministry'}
{'start': 419, 'end': 431, 'type': 'PER', 'id': 13, 'mention': 'Shen Guofang'}
{'start': 437, 'end': 444, 'type': 'ORG', 'id': 14, 'mention': 'Reuters'}
{'start': 529, 'end': 535, 'type': 'LOC', 'id': 15, 'mention': 'Taiwan'}
{'start': 572, 'end': 577, 'type': 'LOC', 'id': 16, 'mention': 'China'}
{'start': 601, 'end': 607, 'type': 'LOC', 'id': 17, 'mention': 'Taipei'}
{'start': 610, 'end': 621, 'type': 'PER', 'id': 18, 'mention': 'Tang Shubei'}
{'start': 657, 'end': 663, 'type': 'LOC', 'id': 19, 'mention': 'Taiwan'}
{'start': 884, 'end': 899, 'type': 'ORG', 'id': 20, 'mention': "People 's Daily"}
{'start': 907, 'end': 911, 'type': 'PER', 'id': 21, 'mention': 'Tang'}
{'start': 949, 'end': 953, 'type': 'ORG', 'id': 22, 'mention': 'Shen'}
{'start': 959, 'end': 977, 'type': 'ORG', 'id': 23, 'mention': 'Reuters Television'}
{'start': 1017, 'end': 1021, 'type': 'PER', 'id': 24, 'mention': 'Tang'}
{'start': 1121, 'end': 1127, 'type': 'LOC', 'id': 25, 'mention': 'Taiwan'}
{'start': 1136, 'end': 1143, 'type': 'LOC', 'id': 26, 'mention': 'Beijing'}
{'start': 1177, 'end': 1182, 'type': 'LOC', 'id': 27, 'mention': 'China'}
{'start': 1212, 'end': 1218, 'type': 'LOC', 'id': 28, 'mention': 'Taipei'}
{'start': 1300, 'end': 1307, 'type': 'LOC', 'id': 29, 'mention': 'Ukraine'}
{'start': 1321, 'end': 1330, 'type': 'MISC', 'id': 30, 'mention': 'Taiwanese'}
{'start': 1346, 'end': 1350, 'type': 'PER', 'id': 31, 'mention': 'Lien'}
------------------ SPACY ANNOTATION SET -----------------
{'start': 0, 'end': 5, 'type': 'GPE', 'id': 0, 'mention': 'China'}
{'start': 11, 'end': 17, 'type': 'GPE', 'id': 1, 'mention': 'Taiwan'}
{'start': 49, 'end': 56, 'type': 'GPE', 'id': 2, 'mention': 'BEIJING'}
{'start': 57, 'end': 67, 'type': 'DATE', 'id': 3, 'mention': '1996-08-22'}
{'start': 69, 'end': 74, 'type': 'GPE', 'id': 4, 'mention': 'China'}
{'start': 78, 'end': 86, 'type': 'DATE', 'id': 5, 'mention': 'Thursday'}
{'start': 95, 'end': 101, 'type': 'GPE', 'id': 6, 'mention': 'Taipei'}
{'start': 162, 'end': 179, 'type': 'LOC', 'id': 7, 'mention': 'the Taiwan Strait'}
{'start': 196, 'end': 203, 'type': 'GPE', 'id': 8, 'mention': 'Ukraine'}
{'start': 207, 'end': 216, 'type': 'NORP', 'id': 9, 'mention': 'Taiwanese'}
{'start': 232, 'end': 241, 'type': 'PERSON', 'id': 10, 'mention': 'Lien Chan'}
{'start': 242, 'end': 251, 'type': 'DATE', 'id': 11, 'mention': 'this week'}
{'start': 268, 'end': 275, 'type': 'GPE', 'id': 12, 'mention': 'Beijing'}
{'start': 288, 'end': 298, 'type': 'TIME', 'id': 13, 'mention': 'only hours'}
{'start': 305, 'end': 312, 'type': 'NORP', 'id': 14, 'mention': 'Chinese'}
{'start': 383, 'end': 389, 'type': 'GPE', 'id': 15, 'mention': 'Taiwan'}
{'start': 392, 'end': 408, 'type': 'ORG', 'id': 16, 'mention': 'Foreign Ministry'}
{'start': 419, 'end': 431, 'type': 'PERSON', 'id': 17, 'mention': 'Shen Guofang'}
{'start': 437, 'end': 444, 'type': 'ORG', 'id': 18, 'mention': 'Reuters'}
{'start': 529, 'end': 535, 'type': 'GPE', 'id': 19, 'mention': 'Taiwan'}
{'start': 553, 'end': 558, 'type': 'ORG', 'id': 20, 'mention': 'State'}
{'start': 572, 'end': 577, 'type': 'GPE', 'id': 21, 'mention': 'China'}
{'start': 601, 'end': 607, 'type': 'GPE', 'id': 22, 'mention': 'Taipei'}
{'start': 610, 'end': 621, 'type': 'GPE', 'id': 23, 'mention': 'Tang Shubei'}
{'start': 657, 'end': 663, 'type': 'GPE', 'id': 24, 'mention': 'Taiwan'}
{'start': 667, 'end': 676, 'type': 'DATE', 'id': 25, 'mention': 'Wednesday'}
{'start': 762, 'end': 765, 'type': 'CARDINAL', 'id': 26, 'mention': 'two'}
{'start': 848, 'end': 856, 'type': 'DATE', 'id': 27, 'mention': 'Thursday'}
{'start': 880, 'end': 899, 'type': 'ORG', 'id': 28, 'mention': "the People 's Daily"}
{'start': 907, 'end': 911, 'type': 'PERSON', 'id': 29, 'mention': 'Tang'}
{'start': 949, 'end': 953, 'type': 'ORG', 'id': 30, 'mention': 'Shen'}
{'start': 959, 'end': 977, 'type': 'ORG', 'id': 31, 'mention': 'Reuters Television'}
{'start': 1017, 'end': 1021, 'type': 'PERSON', 'id': 32, 'mention': 'Tang'}
{'start': 1121, 'end': 1127, 'type': 'GPE', 'id': 33, 'mention': 'Taiwan'}
{'start': 1136, 'end': 1143, 'type': 'GPE', 'id': 34, 'mention': 'Beijing'}
{'start': 1177, 'end': 1182, 'type': 'GPE', 'id': 35, 'mention': 'China'}
{'start': 1212, 'end': 1218, 'type': 'GPE', 'id': 36, 'mention': 'Taipei'}
{'start': 1300, 'end': 1307, 'type': 'GPE', 'id': 37, 'mention': 'Ukraine'}
{'start': 1308, 'end': 1317, 'type': 'DATE', 'id': 38, 'mention': 'this week'}
{'start': 1321, 'end': 1330, 'type': 'NORP', 'id': 39, 'mention': 'Taiwanese'}
{'start': 1346, 'end': 1350, 'type': 'PERSON', 'id': 40, 'mention': 'Lien'}
# Print all the entities whose type differs between NER and the gold standard
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.type != gold_ann_matched[0].type:
print({'mention': txt[ann.start:ann.end], 'spacy_type': ann.type, 'AIDA_type': gold_ann_matched[0].type})
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'BEIJING', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taipei', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Ukraine', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taiwanese', 'spacy_type': 'NORP', 'AIDA_type': 'MISC'}
{'mention': 'Lien Chan', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Beijing', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Chinese', 'spacy_type': 'NORP', 'AIDA_type': 'MISC'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Shen Guofang', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taipei', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Tang Shubei', 'spacy_type': 'GPE', 'AIDA_type': 'PER'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Tang', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Tang', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
{'mention': 'Taiwan', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Beijing', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'China', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taipei', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Ukraine', 'spacy_type': 'GPE', 'AIDA_type': 'LOC'}
{'mention': 'Taiwanese', 'spacy_type': 'NORP', 'AIDA_type': 'MISC'}
{'mention': 'Lien', 'spacy_type': 'PERSON', 'AIDA_type': 'PER'}
# Build a dictionary that stores the possible conversions
conversionMap = {}
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 and ann.type != gold_ann_matched[0].type:
        # If the type is already in the dictionary...
        if ann.type in conversionMap:
            # ... and the value is the same, ignore it (the information is already stored)
            if conversionMap[ann.type] == gold_ann_matched[0].type:
                continue
            # ... and the value is different, print a warning: we will have to check which of the two types is wrong
            else:
                print(ann.type + ' type has more than one corresponding AIDA type: ' + gold_ann_matched[0].type + ' and ' + conversionMap[ann.type])
        # If the type is not yet in the dictionary, add it
else:
conversionMap[ann.type] = gold_ann_matched[0].type
conversionMap
GPE type has more than one corresponding AIDA type: PER and LOC
{'GPE': 'LOC', 'NORP': 'MISC', 'PERSON': 'PER'}
Of course, this kind of operation only works in our setting, because the mappings it finds must be checked by hand: the for loop inserts into the dictionary every type difference between NER and the gold standard. If NER simply misclassifies an entity, we do not want that pair in the conversion map; we only want a mapping when the mismatch is systematic, since that means NER and AIDA are naming the same category in two different ways.
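One way to make that consistency requirement explicit is to count how often each (spaCy type, gold type) pair occurs and keep only the majority mapping for each spaCy type; a sketch along these lines (an illustration, not the procedure used above):
from collections import Counter

pair_counts = Counter()
for ann in to_eval_annset:
    matched = list(gold_annset.within((ann.start, ann.end)))
    if len(matched) == 1 and ann.type != matched[0].type:
        pair_counts[(ann.type, matched[0].type)] += 1
# most_common() yields pairs by decreasing frequency, so setdefault keeps
# the most frequent gold type for each spaCy type
majority_map = {}
for (spacy_type, gold_type), count in pair_counts.most_common():
    majority_map.setdefault(spacy_type, gold_type)
majority_map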
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
    if len(gold_ann_matched) == 1 and ann.type == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.type == to_eval_ann_matched[0].type:
true_positive += 1
else:
false_negative += 1
print('True Positive = ' + str(true_positive))
print('False Positive = ' + str(false_positive))
print('False Negative = ' + str(false_negative))
True Positive = 4
False Positive = 35
False Negative = 28
# Add every other type to the conversion map (key and value equal).
# This way, the loops below will always find a corresponding key in the conversion dictionary.
for ann in to_eval_annset:
if ann.type not in conversionMap:
conversionMap[ann.type] = ann.type
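As an alternative to pre-filling the map, a lookup with a default lets unmapped types fall through unchanged; a minimal sketch:
# dict.get returns the spaCy type itself when no conversion entry exists
def map_type(spacy_type):
    return conversionMap.get(spacy_type, spacy_type)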
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
    if len(gold_ann_matched) == 1 and conversionMap[ann.type] == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 and gold_ann.type == conversionMap[to_eval_ann_matched[0].type]:
true_positive += 1
else:
false_negative += 1
print('True Positive = ' + str(true_positive))
print('False Positive = ' + str(false_positive))
print('False Negative = ' + str(false_negative))
True Positive = 29
False Positive = 10
False Negative = 3
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 \
and ann.start == gold_ann_matched[0].start \
and ann.end == gold_ann_matched[0].end \
        and ann.type == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
exact_typed_precision = true_positive / (true_positive + false_positive)
exact_typed_precision
0.0975609756097561
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 \
and gold_ann.start == to_eval_ann_matched[0].start \
and gold_ann.end == to_eval_ann_matched[0].end \
and gold_ann.type == to_eval_ann_matched[0].type:
true_positive += 1
else:
false_negative += 1
exact_typed_recall = true_positive / (true_positive + false_negative)
exact_typed_recall
0.125
true_positive = 0
false_positive = 0
for ann in to_eval_annset:
gold_ann_matched = list(gold_annset.within((ann.start, ann.end)))
if len(gold_ann_matched) == 1 \
and ann.start == gold_ann_matched[0].start \
and ann.end == gold_ann_matched[0].end \
        and conversionMap[ann.type] == gold_ann_matched[0].type: # type check added
true_positive += 1
else:
false_positive += 1
exact_typed_precision = true_positive / (true_positive + false_positive)
exact_typed_precision
0.7073170731707317
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
to_eval_ann_matched = list(to_eval_annset.within((gold_ann.start, gold_ann.end)))
if len(to_eval_ann_matched) == 1 \
and gold_ann.start == to_eval_ann_matched[0].start \
and gold_ann.end == to_eval_ann_matched[0].end \
and gold_ann.type == conversionMap[to_eval_ann_matched[0].type]:
true_positive += 1
else:
false_negative += 1
exact_typed_recall = true_positive / (true_positive + false_negative)
exact_typed_recall
0.90625
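To see the whole picture at a glance, the precision/recall pairs computed so far can be printed together with their F1 scores; a convenience cell using the variables defined above (exact_typed_* hold the values computed with the conversion map):
# summarize every span-level metric with its harmonic mean
metrics = {
    'exact': (exact_precision, exact_recall),
    'right boundary': (right_precision, right_recall),
    'partial': (partial_precision, partial_recall),
    'exact + mapped type': (exact_typed_precision, exact_typed_recall),
}
for name, (p, r) in metrics.items():
    print(f'{name}: P={p:.3f} R={r:.3f} F1={2 * p * r / (p + r):.3f}')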
# Display the performed NER
eval_annotations_doc = Document.from_dict(doc2.to_dict())
eval_annotations_doc
The first thing I notice is that NER finds far too many DATE entities that, given our objective, are not really important. This is the case for entities like "this week", and could also be the case for days of the week. The same holds for the TIME entity "only hours"; for that one we could instead consider the string "hours after", which carries more useful information, namely how the first event is placed in time relative to the second. The only DATE entity that will probably be useful is the first one: "1996-08-22".
The same can be said of the CARDINAL entity: "two" is probably not relevant.
Geopolitical entities (GPE) and locations (LOC) show some errors. NER is in fact able to distinguish "the Taiwan Strait" (LOC) from its substring "Taiwan" (GPE), but it would probably be more accurate to identify only "Taiwan Strait", leaving out the article "the". Another possible mistake comes in line 7 of the text: instead of identifying the state "Taiwan", it might be better to identify "Taiwan authorities" as an institution (ORG). Lastly, NER labels "Tang Shubei" as GPE ('Countries, cities, states') even though he is a person, later mentioned and correctly classified as "Tang".
'Nationalities or religious or political groups' (NORP) is a difficult type to handle. NER's classification is not wrong, but, given our objectives, we might prefer to treat "Taiwanese Vice President" as a reference to "Lien Chan" rather than only extract the nationality. It is true that Lien Chan is Taiwanese, but the most important piece of information the article conveys with these words is probably not that he is a Taiwanese vice president, but that he is the Vice President of Taiwan.
The 'Companies, agencies, institutions, etc.' (ORG) type also shows some errors. For example, "state media" is at first not recognized as an entity at all; then, a few rows later, only "State" is recognized as an institution. This classification is probably wrong, as "State" and "state media" refer to two similar but different institutions. While "Reuters" and "Reuters Television" are correctly recognized, "Shen" is the name of the person "Shen Guofang" cited earlier in the text, not an organization. The root of this problem may lie in how the sentence is formulated: NER may have identified Shen as an entity because of the preceding "'s", and then labeled him as an institution because of the proximity to "foreign ministry", of which Shen is the spokesman. Another error worth noting is that "foreign ministry" is not recognized the second time (while "Foreign Ministry", a few lines above, is), probably because of the missing capital letters.
txt
'China says Taiwan spoils atmosphere for talks . \nBEIJING 1996-08-22 \nChina on Thursday accused Taipei of spoiling the atmosphere for a resumption of talks across the Taiwan Strait with a visit to Ukraine by Taiwanese Vice President Lien Chan this week that infuriated Beijing . \nSpeaking only hours after Chinese state media said the time was right to engage in political talks with Taiwan , Foreign Ministry spokesman Shen Guofang told Reuters : " The necessary atmosphere for the opening of the talks has been disrupted by the Taiwan authorities . " \nState media quoted China \'s top negotiator with Taipei , Tang Shubei , as telling a visiting group from Taiwan on Wednesday that it was time for the rivals to hold political talks . \n" Now is the time for the two sides to engage in political talks ... \nthat is to end the state of hostility , " Thursday \'s overseas edition of the People \'s Daily quoted Tang as saying . \nThe foreign ministry \'s Shen told Reuters Television in an interview he had read reports of Tang \'s comments but gave no details of why the negotiator had considered the time right for talks with Taiwan , which Beijing considers a renegade province . \nChina , which has long opposed all Taipei efforts to gain greater international recognition , was infuriated by a visit to Ukraine this week by Taiwanese Vice President Lien . '
import spacy_transformers
import spacy_dbpedia_spotlight
ner.add_pipe('dbpedia_spotlight')
ner.pipeline
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1e6ecea7e80>), ('tagger', <spacy.pipeline.tagger.Tagger at 0x1e6ecea7d00>), ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1e6ecd12d60>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler at 0x1e6ecf1b840>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1e6ece9dd00>), ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1e6ecd12cf0>), ('dbpedia_spotlight', <spacy_dbpedia_spotlight.entity_linker.EntityLinker at 0x1e6eea195b0>)]
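The component was added here with its default settings; spacy_dbpedia_spotlight also accepts a config dict when the pipe is added. A sketch of a stricter setup (the 'confidence' key and the 0.5 threshold are assumptions about the installed version; check the package README):
# build a second pipeline whose Spotlight annotations must reach a minimum
# confidence; higher values trade recall for precision
ner_strict = spacy.load(spacy_model)
ner_strict.add_pipe('dbpedia_spotlight', config={'confidence': 0.5})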
doc_spotlight = ner(txt)
doc_spotlight.ents
(China, Taiwan, BEIJING, China, Taipei, Taiwan Strait, Ukraine, Taiwanese, Vice President, Chan, Beijing, Chinese, Taiwan, Foreign Ministry, Reuters, Taiwan, China, Taipei, Tang, Taiwan, Tang, Reuters, Tang, Taiwan, Beijing, China, Taipei, Ukraine, Taiwanese, Vice President)
import pandas as pd
# access dbpedia links using `ent.kb_id_`
pd.DataFrame(
[(ent.text, ent.kb_id_, ent._.dbpedia_raw_result['@similarityScore']) for ent in doc_spotlight.ents],
columns = ['mention', 'link', 'similarity']
)
 | mention | link | similarity
---|---|---|---
0 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
1 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
2 | BEIJING | http://dbpedia.org/resource/Government_of_China | 0.9633710026526001 |
3 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
4 | Taipei | http://dbpedia.org/resource/Government_of_the_... | 0.976859540640933 |
5 | Taiwan Strait | http://dbpedia.org/resource/Cross-Strait_relat... | 0.9999997051382232 |
6 | Ukraine | http://dbpedia.org/resource/Ukraine | 0.9997112846180765 |
7 | Taiwanese | http://dbpedia.org/resource/Taiwanese_people | 0.9931038004023124 |
8 | Vice President | http://dbpedia.org/resource/Vice_President_of_... | 0.9999999990778861 |
9 | Chan | http://dbpedia.org/resource/Jackie_Chan | 0.7107728194134195 |
10 | Beijing | http://dbpedia.org/resource/Government_of_China | 0.9633710026526001 |
11 | Chinese | http://dbpedia.org/resource/Chinese_language | 0.6431193698723345 |
12 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
13 | Foreign Ministry | http://dbpedia.org/resource/Ministry_of_Foreig... | 0.9999999998583462 |
14 | Reuters | http://dbpedia.org/resource/Reuters | 0.9999999982114787 |
15 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
16 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
17 | Taipei | http://dbpedia.org/resource/Government_of_the_... | 0.976859540640933 |
18 | Tang | http://dbpedia.org/resource/Names_of_China | 0.7477847217912609 |
19 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
20 | Tang | http://dbpedia.org/resource/Names_of_China | 0.7477847217912609 |
21 | Reuters | http://dbpedia.org/resource/Reuters | 0.9999999982114787 |
22 | Tang | http://dbpedia.org/resource/Names_of_China | 0.7477847217912609 |
23 | Taiwan | http://dbpedia.org/resource/Taiwan | 0.9999886037265016 |
24 | Beijing | http://dbpedia.org/resource/Government_of_China | 0.9633710026526001 |
25 | China | http://dbpedia.org/resource/Taiwan | 0.9576150612102466 |
26 | Taipei | http://dbpedia.org/resource/Government_of_the_... | 0.976859540640933 |
27 | Ukraine | http://dbpedia.org/resource/Ukraine | 0.9997112846180765 |
28 | Taiwanese | http://dbpedia.org/resource/Taiwanese_people | 0.9931038004023124 |
29 | Vice President | http://dbpedia.org/resource/Vice_President_of_... | 0.9999999990778861 |
import wikipedia
def generate_candidates_wikipedia(mentions):
candidates = {} # {mention1: [cand11, cand12], mention2: [cand21, cand22]}
for mention in mentions:
        # search returns the titles of the most relevant pages
pages = wikipedia.search(mention, results = 10)
filtered_pages = []
for title in pages:
try:
                # wikipedia.page fetches the full page object for this title
page = wikipedia.page(title)
filtered_pages.append(page)
except wikipedia.DisambiguationError:
# filter out disambiguation pages
pass
except wikipedia.PageError:
                # ignore missing pages (PageError)
pass
candidates[mention] = filtered_pages
return candidates
# get the list of mentions from the annset
mentions = [txt[ann.start:ann.end] for ann in gold_annotations_doc.annset('gold')] # in this example the gold annotations are used
mentions
['China', 'Taiwan', 'BEIJING', 'China', 'Taipei', 'Taiwan Strait', 'Ukraine', 'Taiwanese', 'Lien Chan', 'Beijing', 'Chinese', 'Taiwan', 'Foreign Ministry', 'Shen Guofang', 'Reuters', 'Taiwan', 'China', 'Taipei', 'Tang Shubei', 'Taiwan', "People 's Daily", 'Tang', 'Shen', 'Reuters Television', 'Tang', 'Taiwan', 'Beijing', 'China', 'Taipei', 'Ukraine', 'Taiwanese', 'Lien']
candidates = generate_candidates_wikipedia(mentions) # the dict keys de-duplicate repeated mentions
from pprint import pprint # pretty print
pprint(candidates)
{'BEIJING': [<WikipediaPage 'Beijing'>, <WikipediaPage 'Beijing Radio and Television Station'>, <WikipediaPage 'BAIC Group'>, <WikipediaPage '2022 Winter Olympics'>, <WikipediaPage 'Politics of Beijing'>, <WikipediaPage 'Beijing Guoan F.C.'>, <WikipediaPage 'Beijing Subway'>, <WikipediaPage '2012 Summer Olympics'>, <WikipediaPage '1989 Tiananmen Square protests and massacre'>], 'Beijing': [<WikipediaPage 'Beijing'>, <WikipediaPage 'Beijing Radio and Television Station'>, <WikipediaPage 'BAIC Group'>, <WikipediaPage '2022 Winter Olympics'>, <WikipediaPage 'Politics of Beijing'>, <WikipediaPage 'Beijing Subway'>, <WikipediaPage '2012 Summer Olympics'>, <WikipediaPage 'G4 Beijing–Hong Kong and Macau Expressway'>, <WikipediaPage '1989 Tiananmen Square protests and massacre'>], 'China': [<WikipediaPage 'Taiwan'>, <WikipediaPage 'History of China'>, <WikipediaPage 'Republic of China (1912–1949)'>, <WikipediaPage 'Chinese language'>, <WikipediaPage 'Mainland China'>, <WikipediaPage 'Chinese people'>, <WikipediaPage 'List of regions of China'>], 'Chinese': [<WikipediaPage 'Chinese language'>, <WikipediaPage 'History of China'>, <WikipediaPage 'Chinese characters'>, <WikipediaPage 'Rabbit (zodiac)'>, <WikipediaPage 'Standard Chinese'>, <WikipediaPage 'Mandarin Chinese'>, <WikipediaPage 'Taiwan'>], 'Foreign Ministry': [<WikipediaPage 'Ministry of foreign affairs'>, <WikipediaPage 'Ministry of Foreign Affairs (Israel)'>, <WikipediaPage 'Ministry for Europe and Foreign Affairs (France)'>, <WikipediaPage 'Ministry of Foreign Affairs (Russia)'>, <WikipediaPage 'Ministry of Foreign Affairs of the People's Republic of China'>, <WikipediaPage 'Ministry of Foreign Affairs (Afghanistan)'>, <WikipediaPage 'Foreign, Commonwealth and Development Office'>, <WikipediaPage 'Ministry of Foreign Affairs (Japan)'>, <WikipediaPage 'Ministry of Foreign Affairs (Pakistan)'>, <WikipediaPage 'Ministry of Foreign Affairs (Ukraine)'>], 'Lien': [<WikipediaPage 'Jennifer Lien'>, <WikipediaPage 'University of Pau and the Adour Region'>, <WikipediaPage 'Mechanic's lien'>, <WikipediaPage 'Tax lien'>, <WikipediaPage 'Phương Liên'>, <WikipediaPage 'Đinh dynasty'>, <WikipediaPage 'Crouching Tiger, Hidden Dragon'>, <WikipediaPage 'Banker's lien'>], 'Lien Chan': [<WikipediaPage 'Lien Chan'>, <WikipediaPage 'Ma Ying-jeou'>, <WikipediaPage 'Kuomintang'>, <WikipediaPage '2005 Pan–Blue visits to mainland China'>, <WikipediaPage '2000 Taiwanese presidential election'>, <WikipediaPage 'Lee Teng-hui'>, <WikipediaPage '2004 Taiwanese presidential election'>, <WikipediaPage '1996 Taiwanese presidential election'>, <WikipediaPage 'Vincent Siew'>], "People 's Daily": [<WikipediaPage 'People's Daily'>, <WikipediaPage 'People's Court Daily'>, <WikipediaPage 'People's World'>, <WikipediaPage 'People's Liberation Army Daily'>, <WikipediaPage 'Dziennik Ludowy (People's Daily)'>, <WikipediaPage 'The Daily Show'>, <WikipediaPage 'The Daily Telegraph'>, <WikipediaPage 'Everyday life'>], 'Reuters': [<WikipediaPage 'Reuters'>, <WikipediaPage 'Thomson Reuters'>, <WikipediaPage 'Fritz Reuter'>, <WikipediaPage 'Paul Reuter'>, <WikipediaPage 'Suzanne Reuter'>, <WikipediaPage 'Reuters Group'>, <WikipediaPage 'Ernst Reuter'>, <WikipediaPage 'Reuter concession'>, <WikipediaPage 'Saudi Arabian–led intervention in Yemen'>], 'Reuters Television': [<WikipediaPage 'Reuters'>, <WikipediaPage 'Thomson Reuters'>, <WikipediaPage 'Reuters Group'>, <WikipediaPage 'Reuters TV'>, <WikipediaPage 'The Refinitiv Business Classification'>, <WikipediaPage '2017 in American 
television'>, <WikipediaPage 'Suzanne Reuter'>, <WikipediaPage 'Jules Asner'>, <WikipediaPage 'Christine Romans'>], 'Shen': [<WikipediaPage 'Shěn'>, <WikipediaPage 'Shen Changyin and Shen Changping'>, <WikipediaPage 'Shen Teng'>, <WikipediaPage 'Shen Congwen'>, <WikipediaPage 'Shen Quan'>, <WikipediaPage 'O-Shen'>, <WikipediaPage 'Shen Gongbao'>], 'Shen Guofang': [<WikipediaPage 'Murray's law'>, <WikipediaPage 'Porous medium'>, <WikipediaPage 'Guofang Wei'>, <WikipediaPage 'List of members of the Chinese Academy of Engineering'>, <WikipediaPage 'Ministry of Foreign Affairs of the People's Republic of China'>, <WikipediaPage 'List of Foreign Ministry Spokespersons of the People's Republic of China'>, <WikipediaPage 'Diagnostic delay'>, <WikipediaPage 'National Defense Medical Center'>, <WikipediaPage 'Central Military Commission (China)'>, <WikipediaPage 'List of ships of the People's Liberation Army Navy'>], 'Taipei': [<WikipediaPage 'Taipei'>, <WikipediaPage 'Taipei 101'>, <WikipediaPage 'Chinese Taipei'>, <WikipediaPage 'Taiwan'>, <WikipediaPage 'New Taipei City'>, <WikipediaPage 'Taipei Metro'>, <WikipediaPage '2023 Taipei Open'>, <WikipediaPage 'Taoyuan International Airport'>, <WikipediaPage 'Taipei Fubon Bank'>], 'Taiwan': [<WikipediaPage 'Taiwan'>, <WikipediaPage 'Regions of Taiwan'>, <WikipediaPage 'History of Taiwan'>, <WikipediaPage 'Taiwanese people'>, <WikipediaPage 'Taiwanese indigenous peoples'>, <WikipediaPage 'Political status of Taiwan'>, <WikipediaPage 'Geography of Taiwan'>, <WikipediaPage 'Taiwan, China'>], 'Taiwan Strait': [<WikipediaPage 'Taiwan Strait'>, <WikipediaPage 'Third Taiwan Strait Crisis'>, <WikipediaPage 'Second Taiwan Strait Crisis'>, <WikipediaPage 'First Taiwan Strait Crisis'>, <WikipediaPage 'Cross-Strait relations'>, <WikipediaPage 'Political status of Taiwan'>, <WikipediaPage 'Taiwan Strait Crises'>, <WikipediaPage '2022 Chinese military exercises around Taiwan'>, <WikipediaPage 'Geography of Taiwan'>, <WikipediaPage 'Taiwan Strait Tunnel Project'>], 'Taiwanese': [<WikipediaPage 'Taiwan'>, <WikipediaPage 'Taiwanese Hokkien'>, <WikipediaPage 'Taiwanese people'>, <WikipediaPage 'Taiwanese indigenous peoples'>, <WikipediaPage 'Taiwanese Americans'>, <WikipediaPage 'Taiwanese cuisine'>, <WikipediaPage 'Han Taiwanese'>, <WikipediaPage 'Taiwanese Hakka'>, <WikipediaPage 'Taiwanese Mandarin'>], 'Tang': [<WikipediaPage 'Tank'>, <WikipediaPage 'Song dynasty'>, <WikipediaPage 'Wu-Tang Clan'>, <WikipediaPage 'Later Tang'>, <WikipediaPage 'Tang Wei'>, <WikipediaPage 'Tang Sanzang'>, <WikipediaPage 'Tang dynasty'>, <WikipediaPage 'Tang (drink mix)'>, <WikipediaPage 'Tangs'>], 'Tang Shubei': [<WikipediaPage 'Qiandao Lake incident'>, <WikipediaPage 'Fallen City'>], 'Ukraine': [<WikipediaPage 'Ukraine'>, <WikipediaPage 'Russian invasion of Ukraine'>, <WikipediaPage 'Ukrainians'>, <WikipediaPage 'Ukrainian language'>, <WikipediaPage 'Armed Forces of Ukraine'>, <WikipediaPage 'Ukrainian Soviet Socialist Republic'>, <WikipediaPage '2023 Belgorod Oblast incursion'>, <WikipediaPage 'History of Ukraine'>]}
import textdistance
jaccard = textdistance.Jaccard(qval = 2)
candidate_rankings = {}
for mention, m_candidates in candidates.items():
similarities = [
jaccard.similarity(mention.lower(), m_cand.title.lower())
for m_cand in m_candidates
]
cand_and_sim = list(zip(m_candidates, similarities))
cand_and_sim.sort(reverse=True, key=lambda x: x[1]) # sort descending by similarity
candidate_rankings[mention] = cand_and_sim
best_candidates = {mention:cands[0] for mention, cands in candidate_rankings.items()}
pprint(best_candidates)
{'BEIJING': (<WikipediaPage 'Beijing'>, 1),
 'Beijing': (<WikipediaPage 'Beijing'>, 1),
 'China': (<WikipediaPage 'Mainland China'>, 0.3076923076923077),
 'Chinese': (<WikipediaPage 'Chinese language'>, 0.4),
 'Foreign Ministry': (<WikipediaPage 'Ministry of foreign affairs'>, 0.5185185185185185),
 'Lien': (<WikipediaPage 'Tax lien'>, 0.42857142857142855),
 'Lien Chan': (<WikipediaPage 'Lien Chan'>, 1),
 "People 's Daily": (<WikipediaPage 'People's Daily'>, 0.8),
 'Reuters': (<WikipediaPage 'Reuters'>, 1),
 'Reuters Television': (<WikipediaPage 'Reuters TV'>, 0.4444444444444444),
 'Shen': (<WikipediaPage 'O-Shen'>, 0.6),
 'Shen Guofang': (<WikipediaPage 'Guofang Wei'>, 0.4),
 'Taipei': (<WikipediaPage 'Taipei'>, 1),
 'Taiwan': (<WikipediaPage 'Taiwan'>, 1),
 'Taiwan Strait': (<WikipediaPage 'Taiwan Strait'>, 1),
 'Taiwanese': (<WikipediaPage 'Han Taiwanese'>, 0.6666666666666666),
 'Tang': (<WikipediaPage 'Tangs'>, 0.75),
 'Tang Shubei': (<WikipediaPage 'Qiandao Lake incident'>, 0.034482758620689655),
 'Ukraine': (<WikipediaPage 'Ukraine'>, 1)}
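The score used for this ranking is Jaccard similarity over character bigrams (qval = 2): each string is decomposed into overlapping two-character grams, and the ratio between shared and total grams is returned. A small worked example (the strings are purely illustrative):
# 'china'   -> bigrams {ch, hi, in, na}
# 'chinese' -> bigrams {ch, hi, in, ne, es, se}
# 3 shared grams out of 7 distinct ones -> 3/7 ≈ 0.4286
jaccard.similarity('china', 'chinese')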
import requests
api_biencoder = 'https://vm.chronos.disco.unimib.it/api/blink/biencoder/mention/doc'
api_indexer = 'https://vm.chronos.disco.unimib.it/api/indexer/search/doc/10'
api_crossencoder = 'https://vm.chronos.disco.unimib.it/api/blink/crossencoder/doc/10'
gold_path = 'AIDA-YAGO2_2_GATE.json' # reassign this variable with your dataset
with open(gold_path, 'r') as inp:
gold_dict = json.load(inp)
# rename needed to use the API (deep copy, so the original 'gold' set keeps its features)
import copy
gold_dict['annotation_sets']['entities_merged'] = copy.deepcopy(gold_dict['annotation_sets']['gold'])
gold_dict['annotation_sets']['entities_merged']['name'] = 'entities_merged'
for ann in gold_dict['annotation_sets']['entities_merged']['annotations']:
ann['features'] = {}
gold_annotations_doc = Document.from_dict(gold_dict)
gold_annotations_doc
import numpy as np
import base64
def vector_encode(v):
s = base64.b64encode(v).decode()
return s
def vector_decode(s, dtype=np.float32):
buffer = base64.b64decode(s)
v = np.frombuffer(buffer, dtype=dtype)
return v
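A quick round-trip check that the two helpers invert each other (np.float32 matches the decoder's default dtype):
# encode a small float32 vector to base64 text and decode it back
v = np.arange(4, dtype=np.float32)
assert np.array_equal(vector_decode(vector_encode(v)), v)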
from requests.auth import HTTPBasicAuth
auth = HTTPBasicAuth('DS2023', 'eexeegheichai3OhChi5AhcheecaaShe')
gold_annotations_doc.features['pipeline'] = [] # force the API to run
gold_annotations_doc.features['pipeline']
[]
# call the Bi-Encoder, which only encodes each mention and its context into a vector
res = requests.post(api_biencoder, auth = auth, json = gold_annotations_doc.to_dict())
res
<Response [200]>
assert res.ok
doc = res.json()
gdoc = Document.from_dict(doc)
gdoc
# obtain the candidate rankings through nearest-neighbour search on the vectors encoded by the Bi-Encoder
res = requests.post(api_indexer, auth = auth, json = gdoc.to_dict())
if res.ok:
doc = res.json()
gdocBi = Document.from_dict(doc)
else:
print('error')
print(res.content)
for ann in gdocBi.annset('entities_merged'):
print(ann.features['linking']['top_candidate'])
{'raw_score': 326.15081787109375, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 81.44474029541016, 'norm_score': 0.5004747453135283} {'raw_score': 322.3394775390625, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.04270935058594, 'norm_score': 0.5179859472060483} {'raw_score': 324.72314453125, 'id': 2301073, 'wikipedia_id': 18603746, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Beijing', 'url': 'https://en.wikipedia.org/wiki?curid=18603746', 'type_': None, 'indexer': 0, 'score': 82.0356674194336, 'norm_score': 0.4883515463892926} {'raw_score': 325.8661193847656, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 81.20738220214844, 'norm_score': 0.49901618910913265} {'raw_score': 328.84820556640625, 'id': 28984, 'wikipedia_id': 57648, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taipei', 'url': 'https://en.wikipedia.org/wiki?curid=57648', 'type_': None, 'indexer': 0, 'score': 79.6422348022461, 'norm_score': 0.47778394320151724} {'raw_score': 334.675537109375, 'id': 42870, 'wikipedia_id': 91052, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan Strait', 'url': 'https://en.wikipedia.org/wiki?curid=91052', 'type_': None, 'indexer': 0, 'score': 82.00558471679688, 'norm_score': 0.496255599517426} {'raw_score': 317.92822265625, 'id': 15756, 'wikipedia_id': 31750, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ukraine', 'url': 'https://en.wikipedia.org/wiki?curid=31750', 'type_': None, 'indexer': 0, 'score': 87.6226806640625, 'norm_score': 0.554060116055794} {'raw_score': 326.936767578125, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 81.32420349121094, 'norm_score': 0.5072666210628048} {'raw_score': 339.0760192871094, 'id': 82451, 'wikipedia_id': 140551, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Lien Chan', 'url': 'https://en.wikipedia.org/wiki?curid=140551', 'type_': None, 'indexer': 0, 'score': 80.01026153564453, 'norm_score': 0.4225672789540661} {'raw_score': 327.8848876953125, 'id': 2301073, 'wikipedia_id': 18603746, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Beijing', 'url': 'https://en.wikipedia.org/wiki?curid=18603746', 'type_': None, 'indexer': 0, 'score': 80.9153823852539, 'norm_score': 0.481682582193009} {'raw_score': 326.45635986328125, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 80.62527465820312, 'norm_score': 0.49543916100707935} {'raw_score': 323.4985656738281, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.46719360351562, 'norm_score': 0.5206337037586383} {'raw_score': 326.993896484375, 'id': 1574394, 'wikipedia_id': 10789929, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ministry of Foreign Affairs (Taiwan)', 'url': 'https://en.wikipedia.org/wiki?curid=10789929', 'type_': None, 
'indexer': 0, 'score': 82.36274719238281, 'norm_score': 0.49877553104761246} {'raw_score': 337.355712890625, 'id': 189629, 'wikipedia_id': 457455, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Zeng Guofan', 'url': 'https://en.wikipedia.org/wiki?curid=457455', 'type_': None, 'indexer': 0, 'score': 81.17063903808594, 'norm_score': 0.4463794058794387} {'raw_score': 326.7002258300781, 'id': 2350980, 'wikipedia_id': 18998750, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Reuters', 'url': 'https://en.wikipedia.org/wiki?curid=18998750', 'type_': None, 'indexer': 0, 'score': 83.73574829101562, 'norm_score': 0.5179096424892523} {'raw_score': 323.9019775390625, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.23236083984375, 'norm_score': 0.5191689144655522} {'raw_score': 325.56353759765625, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 81.94345092773438, 'norm_score': 0.5035393026537852} {'raw_score': 327.9543151855469, 'id': 28984, 'wikipedia_id': 57648, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taipei', 'url': 'https://en.wikipedia.org/wiki?curid=57648', 'type_': None, 'indexer': 0, 'score': 80.64089965820312, 'norm_score': 0.4837750612810223} {'raw_score': 339.0580749511719, 'id': 555105, 'wikipedia_id': 2099360, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Tang Fei', 'url': 'https://en.wikipedia.org/wiki?curid=2099360', 'type_': None, 'indexer': 0, 'score': 79.37348937988281, 'norm_score': 0.43356447857278485} {'raw_score': 323.1243591308594, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.52281188964844, 'norm_score': 0.5209806275385797} {'raw_score': 325.2178955078125, 'id': 129288, 'wikipedia_id': 263163, 'wikidata_qid': None, 'redirects_to': None, 'title': "People's Daily", 'url': 'https://en.wikipedia.org/wiki?curid=263163', 'type_': None, 'indexer': 0, 'score': 85.50372314453125, 'norm_score': 0.49932806565854604} {'raw_score': 329.8238525390625, 'id': 4192559, 'wikipedia_id': 39652240, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Tāng (surname)', 'url': 'https://en.wikipedia.org/wiki?curid=39652240', 'type_': None, 'indexer': 0, 'score': 79.84367370605469, 'norm_score': 0.48148635451503247} {'raw_score': 329.2820739746094, 'id': 5744441, 'wikipedia_id': 59178284, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Shen Chang-huan', 'url': 'https://en.wikipedia.org/wiki?curid=59178284', 'type_': None, 'indexer': 0, 'score': 79.95675659179688, 'norm_score': 0.43394921489291627} {'raw_score': 333.9736633300781, 'id': 5134298, 'wikipedia_id': 51220220, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Reuters TV', 'url': 'https://en.wikipedia.org/wiki?curid=51220220', 'type_': None, 'indexer': 0, 'score': 81.90070343017578, 'norm_score': 0.46144705581830214} {'raw_score': 327.4975891113281, 'id': 2273003, 'wikipedia_id': 18353912, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ignacio Milam Tang', 'url': 'https://en.wikipedia.org/wiki?curid=18353912', 'type_': None, 'indexer': 0, 'score': 81.07706451416016, 'norm_score': 0.4543629102792197} {'raw_score': 322.83648681640625, 'id': 12748, 'wikipedia_id': 25734, 
'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 83.43959045410156, 'norm_score': 0.5204615267715562} {'raw_score': 328.26019287109375, 'id': 2301073, 'wikipedia_id': 18603746, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Beijing', 'url': 'https://en.wikipedia.org/wiki?curid=18603746', 'type_': None, 'indexer': 0, 'score': 80.88457489013672, 'norm_score': 0.4814991877213832} {'raw_score': 326.48834228515625, 'id': 2397, 'wikipedia_id': 5405, 'wikidata_qid': None, 'redirects_to': None, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'type_': None, 'indexer': 0, 'score': 80.95240020751953, 'norm_score': 0.49744933471978137} {'raw_score': 327.7627258300781, 'id': 28984, 'wikipedia_id': 57648, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taipei', 'url': 'https://en.wikipedia.org/wiki?curid=57648', 'type_': None, 'indexer': 0, 'score': 80.20447540283203, 'norm_score': 0.4811568964070991} {'raw_score': 319.93878173828125, 'id': 15756, 'wikipedia_id': 31750, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Ukraine', 'url': 'https://en.wikipedia.org/wiki?curid=31750', 'type_': None, 'indexer': 0, 'score': 85.62303161621094, 'norm_score': 0.5414158352014888} {'raw_score': 326.2209777832031, 'id': 12748, 'wikipedia_id': 25734, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'type_': None, 'indexer': 0, 'score': 81.30545806884766, 'norm_score': 0.5071496948999357} {'raw_score': 334.766357421875, 'id': 82451, 'wikipedia_id': 140551, 'wikidata_qid': None, 'redirects_to': None, 'title': 'Lien Chan', 'url': 'https://en.wikipedia.org/wiki?curid=140551', 'type_': None, 'indexer': 0, 'score': 80.24974822998047, 'norm_score': 0.42383210722520176}
gdoc = gdocBi
# rerank the candidates with the Cross-Encoder (expected to be more accurate than the Bi-Encoder alone)
# it is normal for this call to be slow; do not run this cell too many times
res = requests.post(api_crossencoder, auth = auth, json = gdoc.to_dict())
if res.ok:
doc = res.json()
gdoc = Document.from_dict(doc)
else:
print('error')
print(res.content)
gdoc
# grab the first annotation to inspect its candidate list
for ann in gdoc.annset('entities_merged'):
break
ann.features['linking']['candidates']
[{'id': 2397, 'title': 'China', 'url': 'https://en.wikipedia.org/wiki?curid=5405', 'indexer': 0, 'score': 6.211941719055176, 'bi_score': 81.44474029541016, 'raw_score': 326.15081787109375, 'is_cross': True, 'wikipedia_id': 5405, 'type_': None, 'norm_score': 0.5004747453135283}, {'id': 12748, 'title': 'Taiwan', 'url': 'https://en.wikipedia.org/wiki?curid=25734', 'indexer': 0, 'score': -1.6825027465820312, 'bi_score': 78.55805969238281, 'raw_score': 331.9241638183594, 'is_cross': True, 'wikipedia_id': 25734, 'type_': None, 'norm_score': 0.4900125643617512}, {'id': 750050, 'title': 'Government of China', 'url': 'https://en.wikipedia.org/wiki?curid=3205521', 'indexer': 0, 'score': -6.467350482940674, 'bi_score': 78.4769287109375, 'raw_score': 332.08642578125, 'is_cross': True, 'wikipedia_id': 3205521, 'type_': None, 'norm_score': 0.4881161747591437}, {'id': 1403259, 'title': 'Government of the Republic of China', 'url': 'https://en.wikipedia.org/wiki?curid=8717276', 'indexer': 0, 'score': -6.642155170440674, 'bi_score': 78.0320053100586, 'raw_score': 332.97625732421875, 'is_cross': True, 'wikipedia_id': 8717276, 'type_': None, 'norm_score': 0.48854284955788235}, {'id': 67524, 'title': 'China Airlines', 'url': 'https://en.wikipedia.org/wiki?curid=124485', 'indexer': 0, 'score': -6.757944583892822, 'bi_score': 77.9298095703125, 'raw_score': 333.1806640625, 'is_cross': True, 'wikipedia_id': 124485, 'type_': None, 'norm_score': 0.44623364529866577}, {'id': 1120926, 'title': 'United States of China', 'url': 'https://en.wikipedia.org/wiki?curid=5980963', 'indexer': 0, 'score': -7.1964030265808105, 'bi_score': 78.06914520263672, 'raw_score': 332.9019775390625, 'is_cross': True, 'wikipedia_id': 5980963, 'type_': None, 'norm_score': 0.480572955431974}, {'id': 322457, 'title': 'China Times', 'url': 'https://en.wikipedia.org/wiki?curid=987480', 'indexer': 0, 'score': -7.954238414764404, 'bi_score': 78.10570526123047, 'raw_score': 332.82891845703125, 'is_cross': True, 'wikipedia_id': 987480, 'type_': None, 'norm_score': 0.4364163766788583}, {'id': 5762366, 'title': 'Foreign policy of China', 'url': 'https://en.wikipedia.org/wiki?curid=59502393', 'indexer': 0, 'score': -8.213769912719727, 'bi_score': 78.18390655517578, 'raw_score': 332.6724853515625, 'is_cross': True, 'wikipedia_id': 59502393, 'type_': None, 'norm_score': 0.4912969349358342}, {'id': 4343573, 'title': 'China Current', 'url': 'https://en.wikipedia.org/wiki?curid=41257924', 'indexer': 0, 'score': -8.65864372253418, 'bi_score': 77.98200988769531, 'raw_score': 333.0762939453125, 'is_cross': True, 'wikipedia_id': 41257924, 'type_': None, 'norm_score': 0.45241265356018845}, {'id': 157682, 'title': 'China (disambiguation)', 'url': 'https://en.wikipedia.org/wiki?curid=356842', 'indexer': 0, 'score': -12.302020072937012, 'bi_score': 77.96675109863281, 'raw_score': 333.1067810058594, 'is_cross': True, 'wikipedia_id': 356842, 'type_': None, 'norm_score': 0.5666465468999536}]
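The candidates are already ordered by the cross-encoder score (the 'score' field where is_cross is true; 'bi_score' keeps the bi-encoder value), so the head of the list is the reranked prediction. Printing the first few titles makes the ranking easier to read:
# show the three best candidates for this first mention
for cand in ann.features['linking']['candidates'][:3]:
    print(f"{cand['title']}: cross={cand['score']:.2f}, bi={cand['bi_score']:.2f}")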
# Build a dictionary mapping each mention to the Wikipedia page title found by the naive NEL
NaiveLink = {}
for mention in best_candidates:
    NaiveLink[mention] = best_candidates[mention][0].original_title
# Build two parallel lists: the mention strings and the Wikipedia page titles found by the Bi-Encoder
names = []
BiCandidate = []
for ann in gdocBi.annset('entities_merged'):
names.append(txt[ann.start:ann.end])
BiCandidate.append(ann.features['linking']['top_candidate']['title'])
# Build the list of Wikipedia page titles found by the full BLINK approach (Bi-Encoder + Cross-Encoder)
BlinkCandidate = []
i = 0
for ann in gdoc.annset('entities_merged'):
    # If the two annotation sets do not list the same mentions in the same order, we risk building a wrong dataframe: report an error and stop
if txt[ann.start:ann.end] != names[i]:
print('Error, the two lists of entities are not the same: ' + txt[ann.start:ann.end] + ' is not equal to ' + names[i])
break
i += 1
BlinkCandidate.append(ann.features['linking']['top_candidate']['title'])
# Build the list of Wikipedia page titles for the naive approach, using the linking dictionary built above
NaiveCandidate = []
for name in names:
NaiveCandidate.append(NaiveLink[name])
data = {'names': names,
'NaiveCandidate': NaiveCandidate,
'BiCandidate': BiCandidate,
'BlinkCandidate': BlinkCandidate}
I believe that this approach (which keeps the duplicate entities) is better, because the BLINK candidates can change their linked Wikipedia page based on the context. While this is not possible for the naive NEL approach, the BLINK approaches can therefore link two mentions with the exact same surface form to different Wikipedia pages.
df = pd.DataFrame(data)
df
 | names | NaiveCandidate | BiCandidate | BlinkCandidate
---|---|---|---|---
0 | China | Mainland China | China | China |
1 | Taiwan | Taiwan | Taiwan | Taiwan |
2 | BEIJING | Beijing | Beijing | Beijing |
3 | China | Mainland China | China | China |
4 | Taipei | Taipei | Taipei | Taipei |
5 | Taiwan Strait | Taiwan Strait | Taiwan Strait | Taiwan Strait |
6 | Ukraine | Ukraine | Ukraine | Ukraine |
7 | Taiwanese | Han Taiwanese | Taiwan | Taiwan |
8 | Lien Chan | Lien Chan | Lien Chan | Lien Chan |
9 | Beijing | Beijing | Beijing | Beijing |
10 | Chinese | Chinese language | China | China |
11 | Taiwan | Taiwan | Taiwan | Taiwan |
12 | Foreign Ministry | Ministry of foreign affairs | Ministry of Foreign Affairs (Taiwan) | Ministry of Foreign Affairs of the People's Re... |
13 | Shen Guofang | Guofang Wei | Zeng Guofan | Zeng Guofan |
14 | Reuters | Reuters | Reuters | Reuters |
15 | Taiwan | Taiwan | Taiwan | Taiwan |
16 | China | Mainland China | China | China |
17 | Taipei | Taipei | Taipei | Taipei |
18 | Tang Shubei | Qiandao Lake incident | Tang Fei | Tang Fei |
19 | Taiwan | Taiwan | Taiwan | Taiwan |
20 | People 's Daily | People's Daily | People's Daily | People's Daily |
21 | Tang | Tangs | Tāng (surname) | Tang dynasty |
22 | Shen | O-Shen | Shen Chang-huan | Shen Chang-huan |
23 | Reuters Television | Reuters TV | Reuters TV | Reuters TV |
24 | Tang | Tangs | Ignacio Milam Tang | Tang dynasty |
25 | Taiwan | Taiwan | Taiwan | Taiwan |
26 | Beijing | Beijing | Beijing | Beijing |
27 | China | Mainland China | China | China |
28 | Taipei | Taipei | Taipei | Taipei |
29 | Ukraine | Ukraine | Ukraine | Ukraine |
30 | Taiwanese | Han Taiwanese | Taiwan | Taiwan |
31 | Lien | Tax lien | Lien Chan | Lien Chan |
The naive approach wrongly links "China" to "Mainland China", i.e. the part of China excluding the territories controlled by the Republic of China (Kinmen, Matsu, and the Pescadores). This is clearly a mistake, as the text's "China" refers to the People's Republic of China as a whole, which the other two approaches link correctly.
For all the other geographic locations, none of the NEL approaches makes any mistake (Taiwan, Ukraine, Taipei, Taiwan Strait, ...). A clear difference between the naive approach and the other two shows up, though, in the linking of nationalities: the naive approach links "Taiwanese" to "Han Taiwanese" and "Chinese" to "Chinese language", while the context-aware approaches link both to the corresponding country.
As for people: the naive approach links "Lien" to "Tax lien", whereas the Bi-Encoder and BLINK, which use the context, recover "Lien Chan"; on the other hand, none of the three approaches links "Shen Guofang" or "Tang Shubei" correctly.
Another interesting entity is "Foreign Ministry", where the ability of the encoder-based approaches to disambiguate from context is really handy: instead of the generic "Ministry of foreign affairs" page chosen by the naive approach, they point to a specific country's ministry.
# Re-index the dataframe by annotation start offset, so each row can be looked up from a gold annotation's start
start = []
for ann in gdocBi.annset('entities_merged'):
start.append(ann.start)
df.index = start
df.head()
 | names | NaiveCandidate | BiCandidate | BlinkCandidate
---|---|---|---|---
0 | China | Mainland China | China | China |
11 | Taiwan | Taiwan | Taiwan | Taiwan |
49 | BEIJING | Beijing | Beijing | Beijing |
69 | China | Mainland China | China | China |
95 | Taipei | Taipei | Taipei | Taipei |
In this case it does not make sense to calculate both accuracy and recall, because they are the same thing: there are no "false positives" or "true negatives" to count separately, since the gold standard is by definition always correct and every gold mention receives exactly one predicted link.
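Since the same loop is repeated below once per approach, it can be factored into a small helper; a sketch under the same assumptions (df indexed by start offset, gold links stored in the 'link' feature):
def nel_accuracy(column):
    # fraction of gold annotations whose gold page title matches the
    # candidate title chosen by the given approach
    correct = 0
    for gold_ann in gold_annset:
        candidate = df.loc[gold_ann.start][column]
        gold_title = gold_ann.features['link'].split('/')[-1].replace('_', ' ')
        if candidate == gold_title:
            correct += 1
    return correct / len(gold_annset)
The explicit loops are kept below for readability; nel_accuracy('NaiveCandidate') would reproduce the first of them.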
correct = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['NaiveCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
correct += 1
accuracy = correct / len(gold_annset)
accuracy
0.3125
true_positive = 0
false_negative = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['NaiveCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
true_positive += 1
else:
false_negative += 1
exact_recall = true_positive / (true_positive + false_negative)
exact_recall
0.3125
correct = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['BiCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
correct += 1
accuracy = correct / len(gold_annset)
accuracy
0.3125
correct = 0
for gold_ann in gold_annset:
# get the title found by the NEL approach
candidate = df.loc[gold_ann.start]['BlinkCandidate']
    # get the page title from the gold-standard link
goldTitle = gold_ann.features['link'].split("/")[-1].replace("_", " ")
if candidate == goldTitle:
correct += 1
accuracy = correct / len(gold_annset)
accuracy
0.34375