# DMLEX XML upload script
# From LexBib
import json, re, validators
import time
from xml.etree import cElementTree as ET
import sys, requests
from bots import xwbi # this contains the wikibaseintegrator for python
from bots import xwb # mwclient functions
# Name of the DMLex XML file to upload; the mapping (control data) file and
# the error report are derived from it and live in 'dmlex_mappings/'.
source_file = "SLSH_DMlex_3.3.xml"
mappingfile = f'dmlex_mappings/{source_file.replace(".xml", ".json")}'
errorfile = f'dmlex_mappings/{source_file.replace(".xml", "_errors.csv")}'

# Parse the XML source; its root element is the DMLex lexicographic resource.
tree = ET.parse('dmlex_source/' + source_file)
lexicographic_resource = tree.getroot()
print(f"\nSuccessfully loaded XML source: {source_file}")
# functions
def get_mapping(dict_qid):
    """Return the source-ID -> Wikibase-ID mapping already present for a dictionary.

    Queries the LexBib SPARQL endpoint for all entries linked to ``dict_qid``
    (via P207) and for their senses, and collects the P186 qualifier (ID in the
    DMLex source) found on their P5 statements (Q111 for entries, Q112 for senses).

    :param dict_qid: Q-ID of the Wikibase item describing the dictionary.
    :return: dict mapping each source ID to the entry or sense ID on Wikibase.
    :raises requests.RequestException: if the request fails, times out or
        returns an HTTP error status.
    """
    # The query is kept pre-encoded; {dict_qid} is the only variable part.
    url = f"https://lexbib.elex.is/query/sparql?format=json&query=PREFIX%20lwb%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fentity%2F%3E%0APREFIX%20ldp%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2Fdirect%2F%3E%0APREFIX%20lp%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2F%3E%0APREFIX%20lps%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2Fstatement%2F%3E%0APREFIX%20lpq%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2Fqualifier%2F%3E%0A%0Aselect%20%3Fsource_id%20%3Flid%0A%0Awhere%20%7B%0A%20%20%3Fentry%20ldp%3AP207%20lwb%3A{dict_qid}.%0A%20%20%7B%3Fentry%20lp%3AP5%20%5Blps%3AP5%20lwb%3AQ111%3B%20lpq%3AP186%20%3Fsource_id%5D.%20bind%20%28strafter%28str%28%3Fentry%29%2Cstr%28lwb%3A%29%29%20as%20%3Flid%29%7D%20%0A%20%20union%20%0A%20%20%7B%3Fentry%20ontolex%3Asense%20%3Fsense.%20%3Fsense%20lp%3AP5%20%5Blps%3AP5%20lwb%3AQ112%3B%20lpq%3AP186%20%3Fsource_id%5D.%20bind%20%28strafter%28str%28%3Fsense%29%2Cstr%28lwb%3A%29%29%20as%20%3Flid%29%7D%0A%20%7D"
    print(url)
    # A timeout keeps the script from hanging forever on an unresponsive endpoint;
    # raise_for_status() surfaces HTTP errors instead of failing later in .json().
    r = requests.get(url, timeout=120)
    r.raise_for_status()
    entries = r.json()['results']['bindings']
    print(entries)
    mapping = {}
    for entry in entries:
        source_id = entry['source_id']['value']
        if source_id in mapping:
            # Duplicate source IDs are only reported; the later binding wins.
            print(
                f"Error: duplicate source ID {source_id} - https://lexbib.elex.is/entity/{mapping[source_id]}")
            time.sleep(1)
        mapping[source_id] = entry['lid']['value']
    return mapping
def dump_controldata(controldata):
    """Write ``controldata`` to the mapping file as JSON with 2-space indent.

    The target path is the module-level ``mappingfile``; the file is
    overwritten on every call.
    """
    with open(mappingfile, "w", encoding="utf-8") as metafile:
        metafile.write(json.dumps(controldata, indent=2))
def get_langdata(ietf_code):
    """Look up the wiki language code and language item for an IETF code.

    Runs a SPARQL query (via the ``xwbi`` helper and its configured endpoint
    and prefixes) for the item whose P185 value equals ``ietf_code``.

    :param ietf_code: IETF language code as found in the XML source.
    :return: dict with 'langCode_wiki' (value of P43) and 'langCode_item'
        (the item URI with the configured entity namespace removed).
    :raises IndexError: if the Wikibase has no item for ``ietf_code``.
    """
    print(f"Getting wiki code and wikibase item for language {ietf_code} (IETF) from LexBib Wikibase...")
    query = "select ?iso_639_3 ?langCode_wiki ?langCode_item where { "
    query += f'?langCode_item xdp:P32 ?iso_639_3; xdp:P43 ?langCode_wiki; xdp:P185 "{ietf_code}".'
    query += "}"
    mapping_config = xwbi.config['mapping']
    response = xwbi.wbi_helpers.execute_sparql_query(
        query=query,
        prefix=mapping_config['wikibase_sparql_prefixes'],
        endpoint=mapping_config['wikibase_sparql_endpoint'])
    results = response['results']['bindings']
    print(results)
    if not results:
        # Same exception type as the former bare results[0] lookup, but with
        # a message that names the offending code.
        raise IndexError(f"No language item with IETF code '{ietf_code}' (P185) found on the Wikibase.")
    first = results[0]  # only the first binding is used
    return {
        'langCode_wiki': first['langCode_wiki']['value'],
        'langCode_item': first['langCode_item']['value'].replace(mapping_config['wikibase_entity_ns'], ""),
    }
def get_label_claim(label_element):
    """Build the P203 label claim for a DMLex ``<label>`` element.

    The element's 'tag' attribute must be a key of ``controldata['labelTag']``;
    the claim points to the mapped item and carries the original tag as a
    P197 qualifier. Problems are printed and logged through ``report_error``
    using the module-level ``entry_id``.

    :param label_element: XML element with a 'tag' attribute.
    :return: the claim, or None if the tag is missing or unknown.
    """
    attributes = label_element.attrib
    if 'tag' not in attributes:
        print(f"Fatal error: There is no 'tag' attrib in this element.")
        report_error(entry_id=entry_id, problem="no tag attrib in label element")
        return None

    label = attributes['tag']
    known_labels = controldata['labelTag']
    if label not in known_labels:
        print(f"Fatal error: The label '{label}' is not a defined controlled value.")
        report_error(entry_id=entry_id, problem=f"label '{label}' is not a defined controlled value")
        return None

    label_qid = known_labels[label]
    print(f"Adding label '{label}' as '{label_qid}'.")
    # P203: dmlex label item, qualified with P197: tag in source
    source_tag_qualifier = xwbi.String(prop_nr="P197", value=label)
    return xwbi.Item(prop_nr="P203", value=label_qid, qualifiers=[source_tag_qualifier])
def get_pronunciation_claim(pronunciation_element):
    """Build the P204 pronunciation claim for a ``<pronunciation>`` element.

    Only the first ``<transcription>`` child is considered, i.e.
    <pronunciation><transcription> is conflated into one P204 string claim.
    A known 'scheme' attribute is attached as a P194 qualifier; an unknown
    one is reported and the claim is built without the qualifier.
    Errors are logged through ``report_error`` with the module-level ``entry_id``.

    TODO: decide what to do with several transcription elements in one
    pronunciation element.

    :param pronunciation_element: XML ``<pronunciation>`` element.
    :return: the claim, or None if there is no transcription or its text is empty.
    """
    for transcription_element in pronunciation_element.findall('transcription'):
        pron_scheme_qualifiers = []
        if 'scheme' in transcription_element.attrib:
            pron_scheme = transcription_element.attrib['scheme']
            if pron_scheme not in controldata['transcriptionSchemeTag']:
                print(
                    f"Fatal error: The transcription scheme tag '{pron_scheme}' is not a defined controlled value.")
                report_error(entry_id=entry_id, problem="transcription scheme tag is not a controlled value")
            else:
                # Property ID written with the 'P' prefix like every other
                # prop_nr in this script (it used to be the bare "194").
                pron_scheme_qualifiers = [xwbi.Item(
                    prop_nr="P194",
                    value=controldata['transcriptionSchemeTag'][pron_scheme])]  # transcription scheme tag
        pron_text = None
        for pron_text_element in transcription_element.findall('text'):
            # .text is None for an empty <text/> element
            pron_text = (pron_text_element.text or "").strip()
        if pron_text:
            return xwbi.String(prop_nr="P204", value=pron_text,
                               qualifiers=pron_scheme_qualifiers)  # dmlex pronunciation
        report_error(entry_id=entry_id, problem="pronunciation transcription text element empty")
        return None
    return None
def get_etymology_claim(etymon_element):
    """Build the P227 etymology claim for an ``<etymon>`` element.

    The claim value is the text of the first non-empty
    ``<etymonUnit><text>``; it is taken for granted that there is no second
    ``<etymonUnit>`` worth keeping in the same ``<etymon>``. If the etymon has
    a ``<note>``, the (last) note text is attached as a P226 qualifier in the
    dictionary's main language (module-level ``langCode_wiki``).

    :param etymon_element: XML ``<etymon>`` element.
    :return: the claim, or None if no etymon unit text is found.
    """
    note_text = None
    for note_element in etymon_element.findall('note'):
        # .text is None for an empty element
        note_text = (note_element.text or "").strip()
    for etymunit_element in etymon_element.findall('etymonUnit'):
        for etymunit_text_element in etymunit_element.findall('text'):
            etymunit_text = (etymunit_text_element.text or "").strip()
            if not etymunit_text:
                continue  # nothing to store for an empty text element
            qualifiers = []
            if note_text:
                # takes main dict lang as language here
                qualifiers = [xwbi.MonolingualText(prop_nr="P226", language=langCode_wiki, text=note_text)]
            return xwbi.String(prop_nr="P227", value=etymunit_text, qualifiers=qualifiers)
    return None
def write_relation(origin=None, target=None, prop=None, relqid=None):
    """Write one relation statement from lexeme ``origin`` to lexeme ``target``.

    Only P214 (relation to entry) is implemented: the claim is appended to the
    origin lexeme with a P223 qualifier holding the relation type item.
    Any other property terminates the script.

    :param origin: ID of the lexeme that receives the statement.
    :param target: ID of the lexeme the statement points to.
    :param prop: relation property; must be "P214".
    :param relqid: Q-ID of the relation type, used as P223 qualifier value.
    :raises SystemExit: if ``prop`` is not "P214".
    """
    if prop != "P214":
        # TODO: relations that connect two senses or point from entry to sense
        # Report the property that was actually passed in (this used to read
        # the unrelated module-level name 'relation_prop').
        print(
            f"Fatal error: relation prop not implemented in script: {prop}")
        sys.exit()

    # relation to entry
    quali = xwbi.Item(prop_nr="P223", value=relqid)
    claim = xwbi.Lexeme(prop_nr="P214", value=target, qualifiers=[quali])
    subject_lexeme = xwbi.wbi.lexeme.get(entity_id=origin)
    subject_lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
    subject_lexeme.write()
    print(
        f" ...written relation to https://lexbib.elex.is/entity/{origin} - {prop} - {target}")
    time.sleep(0.8)  # pause between write requests
def process_inflected_form(form_element=None, lang=None):
    """Turn a DMLex ``<inflectedForm>`` element into a new Wikibase Form.

    The first non-empty ``<text>`` child becomes the representation in
    ``lang``; further ``<text>`` children are reported as errors. The
    element's 'tag' attribute (if any) is stored as a P190 claim with the
    original tag as P197 qualifier; ``<label>`` and ``<pronunciation>``
    children are added as claims as well. Errors are logged through
    ``report_error`` with the module-level ``entry_id``.

    :param form_element: XML ``<inflectedForm>`` element.
    :param lang: language code used for the form representation.
    :return: the new form, or None if the inflected form tag is not a
        defined controlled value.
    """
    new_form = xwbi.Form()
    inflected_form_rep = None
    for inflected_form_text_element in form_element.findall('text'):
        if inflected_form_rep:
            print(f"Fatal error: There is more than one 'text' element in this 'inflected form' element.")
            report_error(entry_id=entry_id, problem="More than one text element in inflected form")
            continue
        # .text is None for an empty <text/> element
        text = (inflected_form_text_element.text or "").strip()
        if not text:
            report_error(entry_id=entry_id, problem="Empty text element in inflected form")
            continue
        inflected_form_rep = text
        print(f"+++ Adding new Form with the representation '{inflected_form_rep}'")
        new_form.representations.set(language=lang, value=inflected_form_rep)
    # inflected form tag as statement to Form
    if 'tag' in form_element.attrib:
        # Read the tag from the element passed in; this used to read the
        # module-level 'inflected_form_element', which is a different element
        # (or undefined) when processing headword translation forms.
        if_tag = form_element.attrib['tag']
        print(f"+++ +++ Inflected form tag is '{if_tag}'")
        if if_tag not in controldata['inflectedFormTag']:
            print(f"Fatal error: The inflected form tag '{if_tag}' is not a defined controlled value.")
            report_error(entry_id=entry_id, problem=f"inflected form tag '{if_tag}' is not a defined controlled value")
            return None
        if_tag_qid = controldata['inflectedFormTag'][if_tag]
        claim = xwbi.Item(prop_nr="P190", value=if_tag_qid, qualifiers=[xwbi.String(prop_nr="P197", value=if_tag)])
        new_form.claims.add(claim, action_if_exists=xwbi.ActionIfExists.FORCE_APPEND)
    for inflected_form_label_element in form_element.findall('label'):
        label_claim = get_label_claim(inflected_form_label_element)
        if label_claim:
            new_form.claims.add(label_claim)
            print(f"Added inflected form label")
    for pronunciation_label in form_element.findall('pronunciation'):
        pronunciation_claim = get_pronunciation_claim(pronunciation_label)
        if pronunciation_claim:
            new_form.claims.add(pronunciation_claim)
            print(f"Added inflected form pronunciation")
    return new_form
def report_error(entry_id=None, problem=None):
    """Append one tab-separated line (entry id, problem) to the error file.

    The target path is the module-level ``errorfile``.

    :param entry_id: ID of the entry (or a section marker) the problem belongs to.
    :param problem: human-readable description of the problem.
    """
    # Explicit UTF-8 (like the mapping file) so non-ASCII IDs and messages do
    # not depend on the platform default; formatting instead of '+' also
    # tolerates a missing (None) entry_id or problem.
    with open(errorfile, 'a', encoding="utf-8") as outfile:
        outfile.write(f"{entry_id}\t{problem}\n")
# main part of the script
# Load the control data (cached IDs of entities already created on Wikibase)
# from the mapping file, or start afresh from the XML root attributes.
try:
    with open(mappingfile, "r", encoding="utf-8") as metafile:
        controldata = json.load(metafile)
except (OSError, ValueError) as load_error:
    # OSError: file missing/unreadable; ValueError covers json.JSONDecodeError.
    print(f"Could not load control data from '{mappingfile}' ({load_error}); starting with fresh control data.")
    controldata = {'title': lexicographic_resource.attrib['title'], 'uri': lexicographic_resource.get('uri')}
# Keep the URI only if it is present and a valid URL.
if not controldata.get('uri') or not validators.url(controldata['uri']):
    controldata['uri'] = None
print(controldata)
# define dictionary source language
# Language data is looked up on the Wikibase only when it is not cached in
# the control data yet; the result is persisted right away.
if 'langCode_wiki' not in controldata or 'langCode_item' not in controldata:
    if 'langCode' not in lexicographic_resource.attrib:
        print("Fatal error: 'langCode' attribute missing in <lexicographicResource>.")
        sys.exit()
    lang_data = get_langdata(lexicographic_resource.attrib['langCode'])
    for lang_key in ('langCode_wiki', 'langCode_item'):
        controldata[lang_key] = lang_data[lang_key]
    dump_controldata(controldata)
langCode_wiki = controldata['langCode_wiki']
langCode_item = controldata['langCode_item']
# define dictionary translation language
# contrary to the dmlex docs, this assumes that the translation language "listing order" is implied in the order of
# the elements (the "listingOrder" attrib is not there in the dmlex xml source datasets)
trans_lang_code = None
trans_lang_item = None
if 'trans_langCodes' not in controldata:
    # Not cached yet: resolve every <translationLanguage> on the Wikibase.
    controldata['trans_langCodes'] = []
    for trans_lang in lexicographic_resource.findall('translationLanguage'):
        if 'langCode' in trans_lang.attrib:
            lang_data = get_langdata(trans_lang.attrib['langCode'])
            controldata['trans_langCodes'].append(
                {'langCode_wiki': lang_data['langCode_wiki'],
                 'langCode_item': lang_data['langCode_item']}
            )
    dump_controldata(controldata)  # persist, as done for the source language
# Set both module-level names from the resolved data, so a first run and a
# rerun from cache agree. Previously the first run left 'trans_lang_code' as
# the raw IETF code and did not set 'trans_lang_item' here at all.
for x in controldata['trans_langCodes']:  # assumes that the dict has only one translation language
    trans_lang_code = x['langCode_wiki']
    trans_lang_item = x['langCode_item']
# define dictionary item on Wikibase
# If the control data does not know the dictionary item yet, create an item
# describing the dictionary and remember its ID; otherwise reuse the known ID.
if "dictionary_item" not in controldata:
    labels = [{'lang': langCode_wiki, 'value': controldata['title']}]
    if langCode_wiki != "en":
        labels.append({'lang': 'en', 'value': controldata['title']})
    statements = [
        {'type': 'item', 'prop_nr': 'P5', 'value': 'Q100'},  # instance of dmlex lexicographical resource
        {'type': 'monolingualtext', 'prop_nr': 'P6', 'value': controldata['title'], 'lang': langCode_wiki},
        {'type': 'item', 'prop_nr': 'P150', 'value': controldata['langCode_item']}
    ]
    # One P134 statement per translation language, qualified (P33) with its
    # 1-based position in the list.
    trans_lang_item = None
    for trans_lang_listpos, trans_lang in enumerate(controldata['trans_langCodes'], start=1):
        position_qualifier = {'type': 'string', 'prop_nr': 'P33', 'value': str(trans_lang_listpos)}
        statements.append({'type': 'item', 'prop_nr': 'P134', 'value': trans_lang['langCode_item'],
                           'qualifiers': [position_qualifier]})
        # assumes that there is only one translang; this will be used for
        # creating headword translation lexemes
        trans_lang_item = trans_lang['langCode_item']
    if controldata['uri']:
        statements.append({'type': 'url', 'prop_nr': 'P112', 'value': controldata['uri']})
    itemdata = {'qid': False, 'statements': statements, 'labels': labels}
    dict_qid = xwbi.itemwrite(itemdata)
    print(f"Item describing dictionary has been created: https://lexbib.elex.is/entity/{dict_qid}")
    controldata['dictionary_item'] = dict_qid
    dump_controldata(controldata)
    time.sleep(3)
dict_qid = controldata['dictionary_item']
# process controlled values
# Maps each controlled-value element name of the XML source to the class item
# used as 'instance of' (P5) value in the items describing its tags.
controlled_value_groups = {
    'labelTypeTag': 'Q104',
    'labelTag': 'Q103',
    'definitionTypeTag': 'Q101',
    'partOfSpeechTag': 'Q105',
    'transcriptionSchemeTag': 'Q107',
    'inflectedFormTag': 'Q102',
    'sourceIdentityTag': 'Q106'  # only relevant if source identity is handled as controlled value, not as literal
}
for cv, group_qid in controlled_value_groups.items():
    controldata.setdefault(cv, {})
    for tagname in lexicographic_resource.findall(cv):
        val = tagname.attrib['tag']
        # An optional 'typeTag' must itself be a known labelTypeTag; an
        # unknown one is reported and then ignored.
        tag_type = tagname.attrib.get('typeTag')
        if tag_type is not None and tag_type not in controldata['labelTypeTag']:
            print(f"Fatal error: Tag type '{tag_type}' is unknown.")
            report_error(entry_id="* Tag Type", problem=f" Tag type '{tag_type}' is unknown.")
            tag_type = None
        # Default description; replaced by the (last) <description> child, if any.
        descriptions = [{'lang': 'en', 'value': "Tag in " + controldata['title']}]
        for description_element in tagname.findall('description'):
            descriptions = [{'lang': 'en', 'value': description_element.text.strip() + " in " + controldata['title']}]
        print(f"\nWill check value '{val}' of controlled value group '{cv}'...")
        if val in controldata[cv]:
            print(f"Wikibase item for {val} ({cv}) is already there: {controldata[cv][val]}")
            continue
        print(f"Need to create Wikibase item for {val} ({cv})...")
        labels = [{'lang': langCode_wiki, 'value': val}]
        if langCode_wiki != "en":
            labels.append({'lang': 'en', 'value': val})
        dict_qualifier = {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid}
        statements = [
            {'type': 'item', 'prop_nr': 'P5', 'value': group_qid},  # instance of (cv group)
            {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid},
            {'type': 'string', 'prop_nr': 'P197', 'value': val, 'qualifiers': [dict_qualifier]}
        ]
        if tag_type:
            statements.append({'type': 'item', 'prop_nr': 'P191', 'value': controldata['labelTypeTag'][tag_type]})
        itemdata = {'qid': False, 'statements': statements, 'labels': labels, 'descriptions': descriptions}
        val_qid = xwbi.itemwrite(itemdata)
        controldata[cv][val] = val_qid
        dump_controldata(controldata)  # persist after every newly created item
        time.sleep(1.5)
# check structure of entries
# Runs once per dictionary (skipped when 'structure' is already cached):
# collects which element tags and attributes occur at resource, entry and
# sense level, counts the entries, and stores both in the control data.
if 'structure' not in controldata:
    print("\n...Checking XML content...")
    structure = {'lr': {}, 'entry': {}, 'sense': {}}
    # direct children of <lexicographicResource>: tag -> list of attribute names
    for lr_element in lexicographic_resource:
        if lr_element.tag not in structure['lr']:
            structure['lr'][lr_element.tag] = []
        # NOTE(review): attributes are only collected from the second
        # occurrence of a tag onwards (this is an elif) - confirm that
        # skipping the first occurrence is intended. Same pattern below.
        elif lr_element.tag != "entry":
            for attr in lr_element.attrib:
                if attr not in structure['lr'][lr_element.tag]:
                    structure['lr'][lr_element.tag].append(attr)
    entry_count = 0
    for entry_to_check in lexicographic_resource.findall('entry'):
        entry_count += 1
        # direct children of <entry>: attribute names and sub-element counts
        for element in entry_to_check:
            if element.tag not in structure['entry']:
                structure['entry'][element.tag] = {'attribs': [], 'sub_elements': {}}
            else:
                for attr in element.attrib:
                    if attr not in structure['entry'][element.tag]['attribs']:
                        structure['entry'][element.tag]['attribs'].append(attr)
            # count the sub-elements (other than <sense>) of this one element
            actual_sub_elements = {}
            for sub_element in element:
                if sub_element.tag != "sense" and sub_element.tag not in actual_sub_elements:
                    actual_sub_elements[sub_element.tag] = 1
                elif sub_element.tag != "sense":
                    actual_sub_elements[sub_element.tag] += 1
            # keep, per sub-element tag, the highest count seen in any single element
            for sub_element_tag in actual_sub_elements:
                if sub_element_tag not in structure['entry'][element.tag]['sub_elements']:
                    structure['entry'][element.tag]['sub_elements'][sub_element_tag] = actual_sub_elements[
                        sub_element_tag]
                elif actual_sub_elements[sub_element_tag] > structure['entry'][element.tag]['sub_elements'][
                        sub_element_tag]:
                    structure['entry'][element.tag]['sub_elements'][sub_element_tag] = actual_sub_elements[
                        sub_element_tag]
        # direct children of each <sense>: tag -> list of attribute names
        for sense_element_to_check in entry_to_check.findall('sense'):
            for s_element in sense_element_to_check:
                if s_element.tag not in structure['sense']:
                    structure['sense'][s_element.tag] = []
                else:
                    for attr in s_element.attrib:
                        if attr not in structure['sense'][s_element.tag]:
                            structure['sense'][s_element.tag].append(attr)
    controldata['number_of_entries'] = entry_count
    controldata['structure'] = structure
    dump_controldata(controldata)
# relation types
# Every <relationType> of the resource gets an item on the Wikibase (unless
# the control data already knows it); the property to use for relations of
# that type and the item ID are cached in controldata['relation_types'].
controldata.setdefault('relation_types', {})
for relation_type_element in lexicographic_resource.findall('relationType'):
    relation_type = relation_type_element.attrib['type']
    print(f"Will check relation type '{relation_type}'...")
    if relation_type in controldata['relation_types']:
        print(f"Relation type '{relation_type}' is already there as '{controldata['relation_types'][relation_type]}'.")
        continue
    labels = [{'lang': 'en', 'value': relation_type}]
    # Default description; replaced by the (last) <description> child, if any.
    descriptions = [{'lang': 'en', 'value': "Relation in " + controldata['title']}]
    for description_element in relation_type_element.findall('description'):
        descriptions = [{'lang': 'en', 'value': description_element.text.strip() + " in " + controldata['title']}]
    rel_class = "Q109"  # default is bidirectional relation type
    rel_prop = "P214"  # default range is Lexeme
    rel_target_type = "entry"  # default target type is entry
    for member_type_element in relation_type_element.findall('memberType'):
        # NOTE(review): the role is compared with "reference_target"
        # (underscore), while <member> roles elsewhere in this script are
        # spelled "reference-target" (hyphen) - confirm against the source data.
        if member_type_element.attrib.get('role') != "reference_target":
            continue
        rel_class = "Q110"  # unidirectional relation type
        rel_target_type = member_type_element.attrib['type']
        if rel_target_type == "sense":
            rel_prop = "P215"  # target is sense (not entry)
        break
    dict_qualifier = {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid}
    statements = [
        # instance of dmlex relation type (unidirectional or bidirectional)
        {'type': 'item', 'prop_nr': 'P5', 'value': rel_class},
        # prop to use, depending on range Lexeme or Sense
        {'type': 'property', 'prop_nr': 'P216', 'value': rel_prop},
        {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid},
        {'type': 'string', 'prop_nr': 'P197', 'value': relation_type, 'qualifiers': [dict_qualifier]}
    ]
    itemdata = {'qid': False, 'statements': statements, 'labels': labels, 'descriptions': descriptions}
    rel_qid = xwbi.itemwrite(itemdata)
    time.sleep(0.5)
    controldata['relation_types'][relation_type] = {'rel_prop': rel_prop, 'rel_qid': rel_qid}
    dump_controldata(controldata)
# process entries
# For every <entry> that is not yet on the Wikibase:
#   1. create a lexeme with headword, labels, pronunciations, etymology, forms;
#   2. add the senses (only senses that have a headword translation are added);
#   3. write the lexeme and map the source sense IDs to the new sense IDs;
#   4. add the examples as lexeme statements, pointing to their sense;
#   5. create one lexeme per headword translation and link it to its sense.
entry_count = 0
number_of_entries = controldata['number_of_entries']
print("\nWill now get mapping of existing entities...")
source_mapping = get_mapping(dict_qid)
input(
    f"\n\nLoaded {len(source_mapping)} known source_id to wikibase mappings. Press ENTER to start to process entries.")
for entry in lexicographic_resource.findall('entry'):
    entry_count += 1
    if 'id' not in entry.attrib:
        print(f"Fatal error: There is no id attribute in this entry element.")
        sys.exit()
    entry_id = entry.attrib['id']  # also read by the helper functions for error reports
    if entry_id in source_mapping:
        print(f"\n[{entry_count}] Entry '{entry_id}' is already on Wikibase as {source_mapping[entry_id]}")
        continue

    # part of speech
    pos_item = "Q108"  # pos 'undefined'
    pos_tag = None
    for part_of_speech in entry.findall('partOfSpeech'):
        if pos_item != "Q108":  # if POS already has been set as not undefined
            print(f"Fatal error: There is more than one 'partofSpeech' element in this entry.")
            report_error(entry_id=entry_id,
                         problem=f"more than one 'partofSpeech' element in this entry. Entry treated as {pos_item} ('{pos_tag}').")
            continue  # keep the first POS, as the report says
        if 'tag' in part_of_speech.attrib:
            pos_tag = part_of_speech.attrib['tag']
            if pos_tag not in controldata['partOfSpeechTag']:
                # the messages used to reference an undefined name 'pos'
                print(f"Fatal error: POS tag '{pos_tag}' is not a known controlled value.")
                report_error(entry_id=entry_id, problem=f"POS tag '{pos_tag}' is not a known controlled value")
                sys.exit()
            pos_item = controldata['partOfSpeechTag'][pos_tag]
    print(
        f"\n[{entry_count} done / {number_of_entries - entry_count} left] Now processing entry '{entry_id}' with POS '{pos_item}'")
    lexeme = xwbi.wbi.lexeme.new(language=langCode_item, lexical_category=pos_item)

    # headword
    headword = None
    for headword_element in entry.findall('headword'):
        if headword:  # if headword already has been set
            print(f"Fatal error: There is more than one 'headword' element in this entry.")
            report_error(entry_id=entry_id, problem="There is more than one 'headword' element in this entry")
            continue
        headword = headword_element.text.strip()
        print(f"Found headword: '{headword}'.")
        lexeme.lemmas.set(language=langCode_wiki, value=headword)
    claim = xwbi.Item(prop_nr="P5", value="Q111", qualifiers=[
        xwbi.String(prop_nr="P186", value=entry_id)  # entry id in dmlex source
    ])  # instance of dmlex Entry
    lexeme.claims.add(claim)
    claim = xwbi.Item(prop_nr="P207", value=dict_qid)  # dmlex source dict
    lexeme.claims.add(claim)

    # label at entry level
    # (the helpers return None after reporting a problem; None must not be
    # handed to claims.add)
    for label_element in entry.findall('label'):
        claim = get_label_claim(label_element)
        if claim is not None:
            lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)

    # pronunciation at entry level
    for pronunciation_element in entry.findall('pronunciation'):
        claim = get_pronunciation_claim(pronunciation_element)  # gets transcription text and scheme
        if claim is not None:
            lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
            print(f"Processed one pronunciation element and added data to entry.")
        # TODO: possible other sub-elements to 'pronunciation' (other than 'transcription')
        # TODO: possible attributes to 'pronunciation'

    # etymology at entry level
    for etymology_element in entry.findall('etymology'):
        for etymon_element in etymology_element.findall('etymon'):
            claim = get_etymology_claim(etymon_element)
            if claim is not None:
                lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
                print(f"Processed one etymon element and added data to entry.")

    # inflectedForm
    for inflected_form_element in entry.findall('inflectedForm'):
        new_form = process_inflected_form(form_element=inflected_form_element, lang=langCode_wiki)
        if new_form is None:  # problem already reported by the helper
            continue
        lexeme.forms.add(new_form)
        lexeme.write()  # written after each form due to bug (adds only the last of the new forms)
        time.sleep(0.50)

    # sense
    headword_translations = {}
    for sense_element in entry.findall('sense'):
        if 'id' not in sense_element.attrib:
            report_error(entry_id=entry_id,
                         problem=f"Sense without sense ID attribute. Sense was skipped.")
            continue
        sense_id = sense_element.attrib['id']
        sense_id_qualifiers = [xwbi.String(prop_nr="P186", value=sense_id)]  # sense id in source
        lexeme_sense = xwbi.Sense()
        claim = xwbi.Item(prop_nr="P5", value="Q112", qualifiers=sense_id_qualifiers)  # instance of dmlex Sense
        lexeme_sense.claims.add(claim)
        print("*** Adding new sense...")

        # label at sense level
        for label_element in sense_element.findall('label'):
            claim = get_label_claim(label_element)
            if claim is not None:
                lexeme_sense.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)

        # definition
        # (stored as P218 statement; sending definitions to glosses is disabled)
        definition_text_lang = langCode_wiki  # assumes that it is the main language of the dictionary here
        for definition_element in sense_element.findall('definition'):
            definition_text = None  # reset, so text of a previous definition is never reused
            for definition_text_element in definition_element.findall('text'):
                definition_text = (definition_text_element.text or "").strip()
            if not definition_text:
                report_error(entry_id=entry_id,
                             problem=f"Definition without text in Sense '{sense_id}'. Definition was skipped.")
                continue
            definition_type_qualifiers = []
            if 'definitionType' in definition_element.attrib:
                definition_type = definition_element.attrib['definitionType']
                if definition_type not in controldata['definitionTypeTag']:
                    print(
                        f"Fatal error: definitionTypeTag '{definition_type}' in Sense '{sense_id}' is not a known controlloed value.")
                    report_error(entry_id=entry_id,
                                 problem=f"DefinitionTypeTag '{definition_type}' in Sense '{sense_id}' is not a known controlloed value.")
                    continue
                def_type_qid = controldata['definitionTypeTag'][definition_type]
                definition_type_qualifiers = [xwbi.Item(prop_nr="P189", value=def_type_qid)]
            claim = xwbi.MonolingualText(prop_nr="P218", text=definition_text, language=definition_text_lang,
                                         qualifiers=definition_type_qualifiers)
            lexeme_sense.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
            print(f"Added definition '{definition_text}' for language '{definition_text_lang}'.")

        # headword translation
        # (sent to sense glosses here; lexemes and links are written later)
        headword_trans_lang = controldata['trans_langCodes'][0][
            'langCode_wiki']  # assumes that the main translation language is the language here
        headword_translations[sense_id] = []
        seen_translations = []  # to avoid redundant translations (present in the data)
        for headword_trans_element in sense_element.findall('headwordTranslation'):
            trans_sense_label_claims = []
            for label_element in headword_trans_element.findall('label'):
                if 'tag' in label_element.attrib:
                    tag = label_element.attrib['tag']
                    if tag not in controldata['labelTag']:
                        print(
                            f"Fatal error: labelTag '{tag}' in headword translation in Sense '{sense_id}' is not a known controlled value.")
                        report_error(entry_id=entry_id,
                                     problem=f"LabelTag '{tag}' in headword translation in Sense '{sense_id}' is not a known controlled value.")
                        continue
                    tag_qid = controldata['labelTag'][tag]
                    # the label claim references the source sense ID (P186)
                    references = xwbi.References()
                    reference_1 = xwbi.Reference()
                    reference_1.add(xwbi.String(prop_nr="P186", value=sense_id))
                    references.add(reference_1)
                    trans_sense_label_claims.append(
                        xwbi.Item(prop_nr="P203", value=tag_qid, qualifiers=[xwbi.String(prop_nr="P197", value=tag)],
                                  references=references))
            trans_forms = []
            for headword_trans_inflected_form_element in headword_trans_element.findall('inflectedForm'):
                new_form = process_inflected_form(form_element=headword_trans_inflected_form_element,
                                                  lang=headword_trans_lang)
                if new_form is not None:
                    trans_forms.append(new_form)
            headword_trans = None
            for headword_trans_text_element in headword_trans_element.findall('text'):
                headword_trans = (headword_trans_text_element.text or "").strip()
            # skip translations without text and duplicates
            if headword_trans and headword_trans not in seen_translations:
                headword_translations[sense_id].append(
                    {'trans': headword_trans, 'label_claims': trans_sense_label_claims, 'trans_forms': trans_forms})
                seen_translations.append(headword_trans)
                print(
                    f"Have found headword translation '{headword_trans}' for language '{headword_trans_lang}' in sense '{sense_id}', saving this for writing later, when wikibase sense id is known.")
        # Joining the kept translations avoids the stray " | " separators that
        # skipped (duplicate) translations used to leave in the gloss.
        translated_gloss = " | ".join(seen_translations)
        if translated_gloss:
            lexeme_sense.glosses.set(language=headword_trans_lang, value=translated_gloss)
            if sense_id:
                lexeme.senses.add(lexeme_sense)
        # a sense without any headword translation is not added to the lexeme

    lexeme.write(clear=False)
    print(f">>> Created new lexeme: https://lexbib.elex.is/wiki/Lexeme:{lexeme.id}")
    time.sleep(0.5)
    source_mapping[entry_id] = lexeme.id

    # get senses ID mapping after writing (when sense ids are assigned)
    for senses_count, wikibase_sense in enumerate(lexeme.senses.get_json(), start=1):
        sense_id = str(senses_count)  # fallback if no source sense id is found on the P5 claim
        for p5_claim in wikibase_sense['claims']['P5']:  # looks for proper sense id
            p186_qualifiers = p5_claim.get('qualifiers', {}).get('P186')
            if p186_qualifiers:
                sense_id = p186_qualifiers[0]['datavalue']['value']
        source_mapping[sense_id] = wikibase_sense['id']

    # examples now, after knowing the sense id mapping
    for sense_element in entry.findall('sense'):
        sense_id = sense_element.attrib.get('id')
        sense_lid = None
        if sense_id is not None:
            if sense_id in source_mapping:
                sense_lid = source_mapping[sense_id]
            else:
                print(f"Dummy sense '{sense_id}': sense id {sense_id} not found in mapping data.")
                time.sleep(1)
        for example_element in sense_element.findall('example'):
            print("... Adding new example")
            qualifiers = []
            if sense_lid:
                qualifiers.append(xwbi.SenseClaim(prop_nr="P211", value=sense_lid))
            example_text = None
            for example_text_element in example_element.findall('text'):
                # itertext() yields the text including that of marker child
                # elements, with XML entities resolved (stripping tags from
                # the serialized XML left '&amp;' etc. in the text)
                example_text = "".join(example_text_element.itertext()).strip()
                # example headword and collocate markers
                for headword_marker_element in example_text_element.findall('headwordMarker'):
                    qualifiers.append(xwbi.String(prop_nr="P219", value=headword_marker_element.text.strip()))
                for collocate_marker_element in example_text_element.findall('collocateMarker'):
                    qualifiers.append(xwbi.String(prop_nr="P220", value=collocate_marker_element.text.strip()))
            if not example_text:
                report_error(entry_id=entry_id,
                             problem=f"Example without text in sense '{sense_id}'. Example was skipped.")
                continue
            # example translation
            for example_trans_element in example_element.findall('exampleTranslation'):
                for example_trans_text_element in example_trans_element.findall('text'):
                    example_trans_text = example_trans_text_element.text.strip()
                    expl_trans_lang = controldata['trans_langCodes'][0]['langCode_wiki']
                    # this takes the first listed translation language
                    # TODO: ensure that there is only one translation language (and, if not, assign the correct translation language)
                    qualifiers.append(
                        xwbi.MonolingualText(prop_nr="P210", text=example_trans_text, language=expl_trans_lang))
            # source identity tag
            if 'sourceIdentity' in example_element.attrib:
                source_identity = example_element.attrib['sourceIdentity']
                if source_identity not in controldata['sourceIdentityTag']:
                    print(
                        f"Fatal error: source identity '{source_identity}' of example in sense '{sense_id}' is not a known controlled value.")
                    report_error(entry_id=entry_id,
                                 problem=f"source identity '{source_identity}' of example in sense '{sense_id}' is not a known controlled value.")
                else:
                    qualifiers.append(
                        xwbi.Item(prop_nr="P193", value=controldata['sourceIdentityTag'][source_identity]))
            # source elaboration comment attribute
            if 'sourceElaboration' in example_element.attrib:
                # property ID written with the 'P' prefix like all others (was "199")
                qualifiers.append(xwbi.String(prop_nr="P199", value=example_element.attrib['sourceElaboration']))
            claim = xwbi.MonolingualText(prop_nr="P213", text=example_text, language=langCode_wiki,
                                         qualifiers=qualifiers)
            lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
    lexeme.write()
    print("Finished adding examples.")
    time.sleep(0.5)

    # headword translations now, after knowing the sense id mapping
    for sense_id, translations in headword_translations.items():
        for translation in translations:
            print(f"Processing a headword translation object: '{translation['trans']}'")
            # create lexeme in translation target language with a sense
            trans_lexeme = xwbi.wbi.lexeme.new(language=trans_lang_item,
                                               lexical_category=pos_item)  # POS inherited from source language entry
            trans_lexeme.lemmas.set(language=headword_trans_lang, value=translation['trans'])
            claim = xwbi.Item(prop_nr="P5", value="Q113")  # a dmlex headword translation object
            trans_lexeme.claims.add(claim)
            claim = xwbi.Item(prop_nr="P207", value=dict_qid)  # dmlex source dict
            trans_lexeme.claims.add(claim)
            for label_claim in translation['label_claims']:
                trans_lexeme.claims.add(label_claim)
            trans_sense = xwbi.Sense()
            trans_sense.glosses.set(language=langCode_wiki, value=headword)
            claim = xwbi.SenseClaim(prop_nr="P225",
                                    value=source_mapping[sense_id])  # link from target sense to origin sense
            trans_sense.claims.add(claim)
            trans_lexeme.senses.add(trans_sense)
            if translation['trans_forms']:
                # one write per form, see the form-writing workaround above
                for trans_form in translation['trans_forms']:
                    print(f"Processing a headword translation form: '{trans_form}'")
                    trans_lexeme.forms.add(trans_form)
                    trans_lexeme.write()
                    print(f"Added inflected form to translation lexeme for translation '{translation['trans']}'")
                    time.sleep(0.5)
            else:
                trans_lexeme.write()
            print(
                f"Created new headword translation lexeme in language '{headword_trans_lang}': '{translation['trans']}', https://lexbib.elex.is/wiki/Lexeme:{trans_lexeme.id}")
            time.sleep(1)
            # write link pointing from translation origin sense to new translation target sense
            trans_sense_lid = trans_lexeme.id + "-S1"  # the new lexeme has exactly one sense
            origin_sense_lid = source_mapping[sense_id]
            claimid = xwb.senseclaim(origin_sense_lid, "P224", trans_sense_lid)
            time.sleep(0.15)
            # the qualifier text uses the same language code as the translation lemma
            xwb.setqualifier(origin_sense_lid, "P224", claimid, "P217",
                             {"language": headword_trans_lang, "text": translation['trans']}, "monolingualtext",
                             replace=False)
    print(
        f"\nFinished processing entry '{entry_id}', now on Wikibase, see it at 'https://lexbib.elex.is/wiki/Lexeme:{lexeme.id}'.\n")
    time.sleep(1)
# add relations
# Every <relation> is written as P214-type statement(s) through
# write_relation(): two members without role form a bidirectional relation
# (written in both directions); a 'reference-origin' / 'reference-target'
# pair forms a unidirectional one. Anything else is reported and skipped.
print(f"\n Starting to add relations...")
relation_count = 0
for relation in lexicographic_resource.findall('relation'):
    relation_count += 1
    relation_type = relation.attrib['type']
    if relation_type not in controldata['relation_types']:
        print(f"Fatal error: relation type {relation_type} is not a known controlled value.")
        report_error(entry_id="* Relation", problem=f"Relation type {relation_type} is not a known controlled value.")
        continue
    relation_prop = controldata['relation_types'][relation_type]['rel_prop']
    relation_qid = controldata['relation_types'][relation_type]['rel_qid']
    member_roles = {"reference-origin": None, "reference-target": None, "bidirectional": []}
    for member in relation.findall('member'):
        if 'role' not in member.attrib:
            member_roles['bidirectional'].append(member.attrib['ref'])
        elif member.attrib['role'] in member_roles:
            member_roles[member.attrib['role']] = member.attrib['ref']

    # Decide which (origin, target) pairs to write. Relations whose member
    # data contains "example" are not treated as bidirectional.
    if len(member_roles['bidirectional']) == 2 and "example" not in str(member_roles):
        first_ref, second_ref = member_roles['bidirectional']
        pairs = [(first_ref, second_ref), (second_ref, first_ref)]
        announcement = (
            f"[{relation_count}] Will write bidirectional relation to: {member_roles['bidirectional']}, relation type is '{relation_type}'.")
    elif member_roles["reference-origin"] and member_roles["reference-target"]:
        pairs = [(member_roles['reference-origin'], member_roles['reference-target'])]
        announcement = (
            f"[{relation_count}] Will write unidirectional relation to: '{member_roles['reference-origin']}' with target '{member_roles['reference-target']}', relation type is '{relation_type}'.")
    else:
        print(f"[{relation_count}] Fatal error: relation data not valid: {member_roles}")
        # encoding='unicode' yields a str, so the report contains readable
        # XML instead of a bytes repr
        report_error(entry_id="* Relation",
                     problem=f"Relation data not valid and left unwritten: {member_roles}, {ET.tostring(relation, encoding='unicode')}")
        continue

    # A member may reference an entry or sense that is not on the Wikibase;
    # report that instead of crashing with a KeyError.
    missing_refs = [ref for ref in pairs[0] if ref not in source_mapping]
    if missing_refs:
        print(f"[{relation_count}] Error: relation member(s) {missing_refs} not found in mapping data.")
        report_error(entry_id="* Relation",
                     problem=f"Relation member(s) {missing_refs} not found in mapping data; relation left unwritten: {member_roles}")
        continue

    print(announcement)
    for origin_ref, target_ref in pairs:
        write_relation(origin=source_mapping[origin_ref], target=source_mapping[target_ref],
                       prop=relation_prop, relqid=relation_qid)