"""DMLex XML upload script.

From LexBib.
"""
import json, re, validators
import time
from xml.etree import cElementTree as ET
import sys, requests

from bots import xwbi  # this contains the wikibaseintegrator for python
from bots import xwb  # mwclient functions

source_file = "SLSH_DMlex_3.3.xml"
mappingfile = f'dmlex_mappings/{source_file.replace(".xml", ".json")}'
errorfile = f'dmlex_mappings/{source_file.replace(".xml", "_errors.csv")}'

tree = ET.ElementTree(file='dmlex_source/' + source_file)
lexicographic_resource = tree.getroot()
print(f"\nSuccessfully loaded XML source: {source_file}")


# functions

def get_mapping(dict_qid):
    """Get the existing Wikibase entry and sense IDs of one dictionary.

    Sends a SPARQL query to the LexBib query service. The query selects every
    entry linked via P207 to ``dict_qid`` whose P5 statement (value Q111)
    carries a P186 qualifier, plus every sense of such an entry whose P5
    statement (value Q112) carries a P186 qualifier. The P186 value is the
    DMLex source ID. Parentheses in the query are percent-encoded in the URL.

    Args:
        dict_qid: ID of the Wikibase item describing the dictionary; it is
            interpolated into the query right after the ``lwb:`` prefix.

    Returns:
        A dict mapping each DMLex source ID to the ID of the corresponding
        Wikibase entry or sense. If a source ID occurs more than once, a
        message is printed and the last binding wins.

    Raises:
        requests.HTTPError: If the query service answers with an error status.
    """
    url = f"https://lexbib.elex.is/query/sparql?format=json&query=PREFIX%20lwb%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fentity%2F%3E%0APREFIX%20ldp%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2Fdirect%2F%3E%0APREFIX%20lp%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2F%3E%0APREFIX%20lps%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2Fstatement%2F%3E%0APREFIX%20lpq%3A%20%3Chttps%3A%2F%2Flexbib.elex.is%2Fprop%2Fqualifier%2F%3E%0A%0Aselect%20%3Fsource_id%20%3Flid%0A%0Awhere%20%7B%0A%20%20%3Fentry%20ldp%3AP207%20lwb%3A{dict_qid}.%0A%20%20%7B%3Fentry%20lp%3AP5%20%5Blps%3AP5%20lwb%3AQ111%3B%20lpq%3AP186%20%3Fsource_id%5D.%20bind%20%28strafter%28str%28%3Fentry%29%2Cstr%28lwb%3A%29%29%20as%20%3Flid%29%7D%20%0A%20%20union%20%0A%20%20%7B%3Fentry%20ontolex%3Asense%20%3Fsense.%20%3Fsense%20lp%3AP5%20%5Blps%3AP5%20lwb%3AQ112%3B%20lpq%3AP186%20%3Fsource_id%5D.%20bind%20%28strafter%28str%28%3Fsense%29%2Cstr%28lwb%3A%29%29%20as%20%3Flid%29%7D%0A%20%7D"
    print(url)
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=120)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError below.
    r.raise_for_status()
    entries = r.json()['results']['bindings']
    print(entries)
    mapping = {}
    for entry in entries:
        source_id = entry['source_id']['value']
        if source_id in mapping:
            print(
                f"Error: duplicate source ID {source_id} - https://lexbib.elex.is/entity/{mapping[source_id]}")
            time.sleep(1)  # leave the message on screen for a moment
        mapping[source_id] = entry['lid']['value']
    return mapping


def dump_controldata(controldata):
    """Write the control data to the mapping file as indented JSON.

    The data is serialized completely before the file is opened for writing,
    so that a serialization failure cannot leave the mapping file (which the
    script reloads on its next run) truncated or half-written.

    Args:
        controldata: JSON-serializable dict holding the control data.

    Raises:
        TypeError: If ``controldata`` contains a non-serializable value.
    """
    serialized = json.dumps(controldata, indent=2)
    with open(mappingfile, "w", encoding="utf-8") as metafile:
        metafile.write(serialized)


def get_langdata(ietf_code):
    """Look up the wiki language code and language item for an IETF code.

    Runs a SPARQL query (via ``xwbi.wbi_helpers.execute_sparql_query``) for
    the item whose P185 value equals ``ietf_code`` and that also has P32 and
    P43 values. Only the first result row is used.

    Args:
        ietf_code: Language code as found in the ``langCode`` attribute of
            the DMLex source.

    Returns:
        A dict with the keys ``'langCode_wiki'`` (the P43 value) and
        ``'langCode_item'`` (the item URI with the configured entity
        namespace removed).

    Raises:
        IndexError: If the query returns no result for ``ietf_code``.
    """
    print(f"Getting wiki code and wikibase item for language {ietf_code} (IETF) from LexBib Wikibase...")
    query = (
        "select ?iso_639_3 ?langCode_wiki ?langCode_item where { "
        + f'?langCode_item xdp:P32 ?iso_639_3; xdp:P43 ?langCode_wiki; xdp:P185 "{ietf_code}".'
        + "}"
    )
    mapping_config = xwbi.config['mapping']
    response = xwbi.wbi_helpers.execute_sparql_query(
        query=query,
        prefix=mapping_config['wikibase_sparql_prefixes'],
        endpoint=mapping_config['wikibase_sparql_endpoint'])
    results = response['results']['bindings']
    print(results)
    if not results:
        # Same exception type as the former bare results[0], but with a
        # message that names the actual problem.
        raise IndexError(
            f"No language item with IETF code '{ietf_code}' (P185) and P32/P43 values was found.")
    first_row = results[0]
    return {
        'langCode_wiki': first_row['langCode_wiki']['value'],
        'langCode_item': first_row['langCode_item']['value'].replace(
            mapping_config['wikibase_entity_ns'], ""),
    }


def get_label_claim(label_element):
    """Turn a DMLex <label> element into a P203 claim.

    The element's ``tag`` attribute is resolved through
    ``controldata['labelTag']``. Problems are printed and logged with
    ``report_error`` for the entry currently being processed (module-level
    ``entry_id``).

    Args:
        label_element: XML element carrying a ``tag`` attribute.

    Returns:
        An ``xwbi.Item`` claim (P203) with the original tag as P197
        qualifier, or ``None`` if the attribute is missing or the tag is not
        a known controlled value.
    """
    tag_value = label_element.attrib.get('tag')
    if tag_value is None:
        print(f"Fatal error: There is no 'tag' attrib in this element.")
        report_error(entry_id=entry_id, problem="no tag attrib in label element")
        return None

    label = tag_value
    known_labels = controldata['labelTag']
    if label in known_labels:
        label_qid = known_labels[label]
        print(f"Adding label '{label}' as '{label_qid}'.")
        # dmlex label item / tag in source
        source_tag_qualifier = xwbi.String(prop_nr="P197", value=label)
        return xwbi.Item(prop_nr="P203", value=label_qid, qualifiers=[source_tag_qualifier])

    print(f"Fatal error: The label '{label}' is not a defined controlled value.")
    report_error(entry_id=entry_id, problem=f"label '{label}' is not a defined controlled value")
    return None


def get_pronunciation_claim(pronunciation_element):
    """Turn a DMLex <pronunciation> element into a P204 claim.

    Only the first <transcription> child is used; in other words,
    <pronunciation><transcription> is conflated into one P204
    ("pronunciation", string) statement. If that transcription has several
    <text> children, the last one wins.

    Args:
        pronunciation_element: XML element with <transcription> children.

    Returns:
        An ``xwbi.String`` claim (P204), qualified with the transcription
        scheme item (P194) when the ``scheme`` attribute is a known
        controlled value. ``None`` if there is no transcription element or
        its text is missing or empty (the latter is logged with
        ``report_error``).
    """
    # TODO: What to do if several transcription elements in one pronunciation element
    for transcription_element in pronunciation_element.findall('transcription'):
        pron_scheme_qualifiers = []
        pron_scheme = transcription_element.attrib.get('scheme')
        if pron_scheme is not None:
            if pron_scheme not in controldata['transcriptionSchemeTag']:
                print(
                    f"Fatal error: The transcription scheme tag '{pron_scheme}' is not a defined controlled value.")
                report_error(entry_id=entry_id, problem="transcription scheme tag is not a controlled value")
            else:
                # transcription scheme tag; property ID written with the "P"
                # prefix like every other property in this script
                pron_scheme_qualifiers = [
                    xwbi.Item(prop_nr="P194", value=controldata['transcriptionSchemeTag'][pron_scheme])]

        pron_text = None
        for pron_text_element in transcription_element.findall('text'):
            # .text is None for an empty element such as <text/>
            pron_text = (pron_text_element.text or "").strip()

        if pron_text:
            return xwbi.String(prop_nr="P204", value=pron_text,
                               qualifiers=pron_scheme_qualifiers)  # dmlex pronunciation
        report_error(entry_id=entry_id, problem="pronunciation transcription text element empty")
        return None
    return None


def get_etymology_claim(etymon_element):
    """Turn a DMLex <etymon> element into a P227 claim.

    The text of the first <etymonUnit> that has a non-empty <text> becomes
    the statement value; this takes for granted that there is no second
    <etymonUnit> element in the <etymon> element. The text of the (last)
    <note> child, if any, is attached as P226 qualifier in the main
    dictionary language (module-level ``langCode_wiki``).

    Args:
        etymon_element: XML element with <etymonUnit> and optional <note>
            children.

    Returns:
        An ``xwbi.String`` claim (P227), or ``None`` if no <etymonUnit> with
        text exists. Units without text are logged with ``report_error``.
    """
    note_text = None
    for note_element in etymon_element.findall('note'):
        # .text is None for an empty element such as <note/>
        note_text = (note_element.text or "").strip()

    for etymunit_element in etymon_element.findall('etymonUnit'):
        # Reset per unit so a missing <text> can never raise NameError.
        etymunit_text = None
        for etymunit_text_element in etymunit_element.findall('text'):
            etymunit_text = (etymunit_text_element.text or "").strip()
        if not etymunit_text:
            report_error(entry_id=entry_id, problem="etymonUnit without text")
            continue
        if note_text:
            # takes main dict lang as language here
            qualifiers = [xwbi.MonolingualText(prop_nr="P226", language=langCode_wiki, text=note_text)]
        else:
            qualifiers = []
        return xwbi.String(prop_nr="P227", value=etymunit_text, qualifiers=qualifiers)
    return None


def write_relation(origin=None, target=None, prop=None, relqid=None):
    """Write a relation statement from one lexeme to another.

    Only P214 (relation to entry) is implemented. For that property a P214
    claim pointing to ``target``, qualified with P223 = ``relqid``, is added
    to the lexeme ``origin`` and written to the Wikibase.

    Args:
        origin: ID of the lexeme that receives the statement.
        target: ID of the lexeme the relation points to.
        prop: Relation property; must be ``"P214"``.
        relqid: ID of the item describing the relation type.

    Raises:
        SystemExit: If ``prop`` is anything other than ``"P214"``.
    """
    if prop != "P214":
        # TODO: relations that connect two senses or point from entry to sense
        # The message reports the `prop` argument; the former code referenced
        # an undefined name here and died with a NameError instead.
        print(f"Fatal error: relation prop not implemented in script: {prop}")
        sys.exit(1)  # non-zero status: this is an error exit

    # relation to entry
    quali = xwbi.Item(prop_nr="P223", value=relqid)
    claim = xwbi.Lexeme(prop_nr="P214", value=target, qualifiers=[quali])
    subject_lexeme = xwbi.wbi.lexeme.get(entity_id=origin)
    subject_lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
    subject_lexeme.write()
    print(
        f"    ...written relation to https://lexbib.elex.is/entity/{origin} - {prop} - {target}")
    time.sleep(0.8)
    # with open(relations_mappingfile, 'a') as file:
    #     file.write(json.dumps({
    #         "subject": member_roles["reference-origin"],
    #         "prop": relation_prop,
    #         "type": relation_type,
    #         "object": member_roles["reference-target"]
    #     })+"\n")


def process_inflected_form(form_element=None, lang=None):
    """Build a new Wikibase Form from a DMLex <inflectedForm> element.

    The first non-empty <text> child becomes the representation in language
    ``lang``. The ``tag`` attribute (if present) is added as a P190 statement,
    and <label> and <pronunciation> children are added as further statements.
    Problems are logged with ``report_error`` for the entry currently being
    processed (module-level ``entry_id``).

    Args:
        form_element: The <inflectedForm> XML element to convert.
        lang: Language code for the form representation.

    Returns:
        The populated ``xwbi.Form``, or ``None`` if the inflected form tag is
        not a known controlled value.
    """
    new_form = xwbi.Form()
    inflected_form_rep = None
    for inflected_form_text_element in form_element.findall('text'):
        if inflected_form_rep:
            print(f"Fatal error: There is more than one 'text' element in this 'inflected form' element.")
            report_error(entry_id=entry_id, problem="More than one text element in inflected form")
            continue
        # .text is None for an empty element such as <text/>; treat empty
        # text like a missing one instead of crashing on None.strip().
        inflected_form_rep = (inflected_form_text_element.text or "").strip() or None
    if inflected_form_rep is None:
        report_error(entry_id=entry_id, problem="No text in inflected form")
    print(f"+++ Adding new Form with the representation '{inflected_form_rep}'")
    new_form.representations.set(language=lang, value=inflected_form_rep)

    # inflected form tag as statement to Form
    if 'tag' in form_element.attrib:
        # Read the tag from the element passed in, not from a module-level
        # loop variable that may belong to a different form.
        if_tag = form_element.attrib['tag']
        print(f"+++ +++ Inflected form tag is '{if_tag}'")
        if if_tag not in controldata['inflectedFormTag']:
            print(f"Fatal error: The inflected form tag '{if_tag}' is not a defined controlled value.")
            report_error(entry_id=entry_id, problem=f"inflected form tag '{if_tag}' is not a defined controlled value")
            return None
        if_tag_qid = controldata['inflectedFormTag'][if_tag]
        claim = xwbi.Item(prop_nr="P190", value=if_tag_qid, qualifiers=[xwbi.String(prop_nr="P197", value=if_tag)])
        new_form.claims.add(claim, action_if_exists=xwbi.ActionIfExists.FORCE_APPEND)

    for inflected_form_label_element in form_element.findall('label'):
        label_claim = get_label_claim(inflected_form_label_element)
        if label_claim:
            new_form.claims.add(label_claim)
            print(f"Added inflected form label")

    for pronunciation_label in form_element.findall('pronunciation'):
        pronunciation_claim = get_pronunciation_claim(pronunciation_label)
        if pronunciation_claim:
            new_form.claims.add(pronunciation_claim)
            print(f"Added inflected form pronunciation")
    return new_form


def report_error(entry_id=None, problem=None):
    """Append one tab-separated line (entry ID, problem) to the error file.

    The file is opened as UTF-8, like the other files this script handles,
    so that non-ASCII tags or headwords in ``problem`` do not depend on the
    platform's default encoding. Formatting via f-string also keeps the call
    from raising a TypeError when an argument is ``None``.

    Args:
        entry_id: ID of the entry (or other marker) the problem belongs to.
        problem: Human-readable description of the problem.
    """
    with open(errorfile, 'a', encoding="utf-8") as outfile:
        outfile.write(f"{entry_id}\t{problem}\n")


# main part of the script

# Load the cached control data from the mapping file. If the file cannot be
# read or does not contain valid JSON, start from the title and URI found on
# the root element of the XML source.
try:
    with open(mappingfile, "r", encoding="utf-8") as metafile:
        controldata = json.load(metafile)
except (OSError, ValueError):
    # OSError: file missing or unreadable; ValueError: invalid JSON or bad
    # encoding. Anything else (e.g. KeyboardInterrupt) is not swallowed.
    controldata = {'title': lexicographic_resource.attrib['title'], 'uri': lexicographic_resource.get('uri')}
    # The uri attribute is optional (.get may return None); only hand an
    # actual string to the validator.
    if not controldata['uri'] or not validators.url(controldata['uri']):
        controldata['uri'] = None

print(controldata)

# define dictionary source language
# (wiki language code and language item are cached in controldata; they are
# only looked up on the Wikibase when one of them is missing)

if not ('langCode_wiki' in controldata and 'langCode_item' in controldata):
    # get language data from Wikibase
    if 'langCode' not in lexicographic_resource.attrib:
        print("Fatal error: 'langCode' attribute missing in <lexicographicResource>.")
        sys.exit()
    lang_data = get_langdata(lexicographic_resource.attrib['langCode'])
    for lang_key in ('langCode_wiki', 'langCode_item'):
        controldata[lang_key] = lang_data[lang_key]
    dump_controldata(controldata)

langCode_wiki, langCode_item = controldata['langCode_wiki'], controldata['langCode_item']

# define dictionary translation language
# contrary to the dmlex docs, this assumes that the translation language "listing order" is implied in the order of the elements (the "listingOrder" attrib is not there in the dmlex xml source datasets)
#
# After this section, trans_lang_code holds the wiki language code and
# trans_lang_item the language item of the (last) translation language, no
# matter whether the data came from the cache or was just looked up. Both
# stay None if the dictionary declares no translation language.

trans_lang_code = None
trans_lang_item = None
if 'trans_langCodes' in controldata:  # assumes that the dict has only one translation language
    for x in controldata['trans_langCodes']:
        trans_lang_code = x['langCode_wiki']
        trans_lang_item = x['langCode_item']
else:
    controldata['trans_langCodes'] = []
    for trans_lang in lexicographic_resource.findall('translationLanguage'):
        if 'langCode' in trans_lang.attrib:
            lang_data = get_langdata(trans_lang.attrib['langCode'])
            controldata['trans_langCodes'].append(
                {'langCode_wiki': lang_data['langCode_wiki'],
                 'langCode_item': lang_data['langCode_item']}
            )
            # Same meaning as in the cached branch above (wiki code and item,
            # not the raw IETF code from the XML).
            trans_lang_code = lang_data['langCode_wiki']
            trans_lang_item = lang_data['langCode_item']
    # Persist the looked-up languages so the next run can use the cache.
    dump_controldata(controldata)

# define dictionary item on Wikibase
# (reuses the item recorded in controldata, otherwise creates one and
# records its ID)

if "dictionary_item" in controldata:
    dict_qid = controldata['dictionary_item']  # dict already exists on Wikibase
else:  # create item describing dictionary
    labels = [{'lang': langCode_wiki, 'value': controldata['title']}]
    if langCode_wiki != "en":
        # the title doubles as English label
        labels += [{'lang': 'en', 'value': controldata['title']}]

    statements = []
    # instance of dmlex lexicographical resource
    statements.append({'type': 'item', 'prop_nr': 'P5', 'value': 'Q100'})
    statements.append(
        {'type': 'monolingualtext', 'prop_nr': 'P6', 'value': controldata['title'], 'lang': langCode_wiki})
    statements.append({'type': 'item', 'prop_nr': 'P150', 'value': controldata['langCode_item']})

    # one P134 statement per translation language, qualified (P33) with its
    # 1-based position in the list
    trans_lang_listpos = 0
    trans_lang_item = None
    for trans_lang_listpos, trans_lang in enumerate(controldata['trans_langCodes'], start=1):
        position_qualifier = {'type': 'string', 'prop_nr': 'P33', 'value': str(trans_lang_listpos)}
        statements.append({'type': 'item', 'prop_nr': 'P134', 'value': trans_lang['langCode_item'],
                           'qualifiers': [position_qualifier]})
        # assumes that there is only one translang; this will be used for creating headword translation lexemes
        trans_lang_item = trans_lang['langCode_item']

    if controldata['uri']:
        statements.append({'type': 'url', 'prop_nr': 'P112', 'value': controldata['uri']})

    itemdata = {'qid': False, 'statements': statements, 'labels': labels}
    dict_qid = xwbi.itemwrite(itemdata)
    print(f"Item describing dictionary has been created: https://lexbib.elex.is/entity/{dict_qid}")
    controldata['dictionary_item'] = dict_qid
    dump_controldata(controldata)
    time.sleep(3)

# process controlled values
# (every tag declared at the top level of the source gets a Wikibase item,
# unless controldata already records one for it)

controlled_value_groups = {
    'labelTypeTag': 'Q104',
    'labelTag': 'Q103',
    'definitionTypeTag': 'Q101',
    'partOfSpeechTag': 'Q105',
    'transcriptionSchemeTag': 'Q107',
    'inflectedFormTag': 'Q102',
    'sourceIdentityTag': 'Q106'  # if this in the data is not used for a controlled value and is treated as literal
}  # values for 'instance of' in items describing tags

for cv, cv_class_qid in controlled_value_groups.items():
    controldata.setdefault(cv, {})
    for tagname in lexicographic_resource.findall(cv):
        val = tagname.attrib['tag']

        # optional type of the tag; must itself be a known labelTypeTag
        tag_type = tagname.attrib.get('typeTag')
        if tag_type is not None and tag_type not in controldata['labelTypeTag']:
            print(f"Fatal error: Tag type '{tag_type}' is unknown.")
            report_error(entry_id="* Tag Type", problem=f" Tag type '{tag_type}' is unknown.")
            tag_type = None

        # generic description, replaced by the (last) <description> child if any
        descriptions = [{'lang': 'en', 'value': "Tag in " + controldata['title']}]
        for description_element in tagname.findall('description'):
            descriptions = [{'lang': 'en', 'value': description_element.text.strip() + " in " + controldata['title']}]

        print(f"\nWill check value '{val}' of controlled value group '{cv}'...")
        if val in controldata[cv]:
            print(f"Wikibase item for {val} ({cv}) is already there: {controldata[cv][val]}")
            continue

        print(f"Need to create Wikibase item for {val} ({cv})...")
        label_langs = [langCode_wiki] if langCode_wiki == "en" else [langCode_wiki, 'en']
        labels = [{'lang': label_lang, 'value': val} for label_lang in label_langs]
        source_dict_qualifier = {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid}
        statements = [
            {'type': 'item', 'prop_nr': 'P5', 'value': cv_class_qid},  # instance of (cv group)
            {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid},
            {'type': 'string', 'prop_nr': 'P197', 'value': val, 'qualifiers': [source_dict_qualifier]},
        ]
        if tag_type:
            statements.append({'type': 'item', 'prop_nr': 'P191', 'value': controldata['labelTypeTag'][tag_type]})
        itemdata = {'qid': False, 'statements': statements, 'labels': labels, 'descriptions': descriptions}
        val_qid = xwbi.itemwrite(itemdata)
        controldata[cv][val] = val_qid
        dump_controldata(controldata)
        time.sleep(1.5)

# check structure of entries
# Runs only when controldata has no 'structure' key yet. Collects which
# element tags occur at resource ('lr'), entry and sense level, together with
# attribute names and per-element child counts, counts the entries, and
# stores the result in controldata ('structure', 'number_of_entries').
if 'structure' not in controldata:
    print("\n...Checking XML content...")
    structure = {'lr': {}, 'entry': {}, 'sense': {}}
    # resource level: tag -> list of attribute names
    for lr_element in lexicographic_resource:
        # NOTE(review): attributes are only collected from the second
        # occurrence of a tag onwards (the first occurrence just creates the
        # empty list), and never for "entry" elements.
        if lr_element.tag not in structure['lr']:
            structure['lr'][lr_element.tag] = []
        elif lr_element.tag != "entry":
            for attr in lr_element.attrib:
                if attr not in structure['lr'][lr_element.tag]:
                    structure['lr'][lr_element.tag].append(attr)
    entry_count = 0
    for entry_to_check in lexicographic_resource.findall('entry'):
        entry_count += 1
        # entry level: tag -> {'attribs': [...], 'sub_elements': {tag: max count}}
        for element in entry_to_check:
            # NOTE(review): as above, attributes of the first occurrence of a
            # tag are not recorded.
            if element.tag not in structure['entry']:
                structure['entry'][element.tag] = {'attribs': [], 'sub_elements': {}}
            else:
                for attr in element.attrib:
                    if attr not in structure['entry'][element.tag]['attribs']:
                        structure['entry'][element.tag]['attribs'].append(attr)
            # count the children of this element per tag (children named
            # "sense" are not counted)
            actual_sub_elements = {}
            for sub_element in element:
                if sub_element.tag != "sense" and sub_element.tag not in actual_sub_elements:
                    actual_sub_elements[sub_element.tag] = 1
                elif sub_element.tag != "sense":
                    actual_sub_elements[sub_element.tag] += 1
                # keep the highest count seen so far for each child tag; this
                # update runs after every child, not once per element
                for sub_element_tag in actual_sub_elements:
                    if sub_element_tag not in structure['entry'][element.tag]['sub_elements']:
                        structure['entry'][element.tag]['sub_elements'][sub_element_tag] = actual_sub_elements[
                            sub_element_tag]
                    elif actual_sub_elements[sub_element_tag] > structure['entry'][element.tag]['sub_elements'][
                        sub_element_tag]:
                        structure['entry'][element.tag]['sub_elements'][sub_element_tag] = actual_sub_elements[
                            sub_element_tag]
            # sense level: tag -> list of attribute names
            # NOTE(review): this loop sits inside the loop over the entry's
            # children, so the senses of an entry are scanned once per child
            # element of that entry.
            for sense_element_to_check in entry_to_check.findall('sense'):
                for s_element in sense_element_to_check:
                    if s_element.tag not in structure['sense']:
                        structure['sense'][s_element.tag] = []
                    else:
                        for attr in s_element.attrib:
                            if attr not in structure['sense'][s_element.tag]:
                                structure['sense'][s_element.tag].append(attr)
    # persist the findings; the entry count is read again when the entries
    # are processed
    controldata['number_of_entries'] = entry_count
    controldata['structure'] = structure
    dump_controldata(controldata)

# relation types
# (every <relationType> of the source gets a Wikibase item, unless
# controldata already records one for it)
controldata.setdefault('relation_types', {})
for relation_type_element in lexicographic_resource.findall('relationType'):
    relation_type = relation_type_element.attrib['type']
    print(f"Will check relation type '{relation_type}'...")
    if relation_type in controldata['relation_types']:
        print(f"Relation type '{relation_type}' is already there as '{controldata['relation_types'][relation_type]}'.")
        continue

    labels = [{'lang': 'en', 'value': relation_type}]
    # generic description, replaced by the (last) <description> child if any
    descriptions = [{'lang': 'en', 'value': "Relation in " + controldata['title']}]
    for description_element in relation_type_element.findall('description'):
        descriptions = [{'lang': 'en', 'value': description_element.text.strip() + " in " + controldata['title']}]

    # defaults: bidirectional relation type, range Lexeme, target type entry
    rel_class, rel_prop, rel_target_type = "Q109", "P214", "entry"
    for member_type_element in relation_type_element.findall('memberType'):
        # NOTE(review): confirm that the source data spells this role with an
        # underscore.
        if member_type_element.attrib.get('role') != "reference_target":
            continue
        rel_class = "Q110"  # unidirectional relation type
        rel_target_type = member_type_element.attrib['type']
        if rel_target_type == "sense":
            rel_prop = "P215"  # target is sense (not entry)
        break  # only the first reference target is considered

    source_dict_qualifier = {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid}
    statements = [
        # instance of dmlex relation type (unidirectional or bidirectional)
        {'type': 'item', 'prop_nr': 'P5', 'value': rel_class},
        # prop to use, depending on range Lexeme or Sense
        {'type': 'property', 'prop_nr': 'P216', 'value': rel_prop},
        {'type': 'item', 'prop_nr': 'P207', 'value': dict_qid},
        {'type': 'string', 'prop_nr': 'P197', 'value': relation_type, 'qualifiers': [source_dict_qualifier]},
    ]
    itemdata = {'qid': False, 'statements': statements, 'labels': labels, 'descriptions': descriptions}
    rel_qid = xwbi.itemwrite(itemdata)
    time.sleep(0.5)
    controldata['relation_types'][relation_type] = {'rel_prop': rel_prop, 'rel_qid': rel_qid}
    dump_controldata(controldata)

# process entries
# (preparation: total entry count from controldata, mapping of source IDs
# that are already on the Wikibase, and a confirmation prompt)
number_of_entries = controldata['number_of_entries']
entry_count = 0
print("\nWill now get mapping of existing entities...")
source_mapping = get_mapping(dict_qid)
start_prompt = f"\n\nLoaded {len(source_mapping)} known source_id to wikibase mappings. Press ENTER to start to process entries."
input(start_prompt)

for entry in lexicographic_resource.findall('entry'):
    entry_count += 1
    if 'id' not in entry.attrib:
        print(f"Fatal error: There is no id attribute in this entry element.")
        sys.exit()
    entry_id = entry.attrib['id']
    if entry_id in source_mapping:
        print(f"\n[{entry_count}] Entry '{entry_id}' is already on Wikibase as {source_mapping[entry_id]}")
        continue
    # if "example" in entry_id: # entry of type "example", to be ignored
    #     print(f"Ignored EXAMPLE entry '{entry_id}'.")
    #     continue

    source_mapping[entry_id] = {'lid': None, 'senses': {}}
    # part of speech
    pos_item = "Q108"  # pos 'undefined'
    for part_of_speech in entry.findall('partOfSpeech'):
        if pos_item != "Q108":  # if POS already has been set as not undefined
            print(f"Fatal error: There is more than one 'partofSpeech' element in this entry.")
            report_error(entry_id=entry_id,
                         problem=f"more than one 'partofSpeech' element in this entry. Entry treated as {pos_item} ('{pos_tag}').")
        if 'tag' in part_of_speech.attrib:
            pos_tag = part_of_speech.attrib['tag']
            if pos_tag not in controldata['partOfSpeechTag']:
                print(f"Fatal error: POS tag '{pos}' is not a known controlled value.")
                report_error(entry_id=entry_id, problem=f"POS tag '{pos}' is not a known controlled value")
                sys.exit()
            pos_item = controldata['partOfSpeechTag'][pos_tag]
    print(
        f"\n[{entry_count} done / {number_of_entries - entry_count} left] Now processing entry '{entry_id}' with POS '{pos_item}'")

    lexeme = xwbi.wbi.lexeme.new(language=langCode_item, lexical_category=pos_item)

    # headword
    headword = None
    for headword_element in entry.findall('headword'):
        if headword:  # if headword already has been set
            print(f"Fatal error: There is more than one 'headword' element in this entry.")
            report_error(entry_id=entry_id, problem="There is more than one 'headword' element in this entry")
            continue
        headword = headword_element.text.strip()
        print(f"Found headword: '{headword}'.")
    lexeme.lemmas.set(language=langCode_wiki, value=headword)
    claim = xwbi.Item(prop_nr="P5", value="Q111", qualifiers=[
        xwbi.String(prop_nr="P186", value=entry_id)  # entry id in dmlex source
    ])  # instance of dmlex Entry
    lexeme.claims.add(claim)
    claim = xwbi.Item(prop_nr="P207", value=dict_qid)  # dmlex source dict
    lexeme.claims.add(claim)

    # label at entry level
    for label_element in entry.findall('label'):
        claim = get_label_claim(label_element)
        lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)

    # pronunciation at entry level
    for pronunciation_element in entry.findall('pronunciation'):
        claim = get_pronunciation_claim(pronunciation_element)
        lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
        print(f"Processed one pronunciation element and added data to entry.")  # gets transcription text and scheme
    # TODO: possible other sub-elements to 'pronunciation' (other than 'transcription')
    # TODO: possible attributes to 'pronunciation'

    # etymology at entry level
    for etymology_element in entry.findall('etymology'):
        for etymon_element in etymology_element.findall('etymon'):
            claim = get_etymology_claim(etymon_element)
            lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
            print(f"Processed one etymon element and added data to entry.")

    # inflectedForm
    for inflected_form_element in entry.findall('inflectedForm'):
        new_form = process_inflected_form(form_element=inflected_form_element, lang=langCode_wiki)

        lexeme.forms.add(new_form)
        lexeme.write()  # due to bug (adds only the last of the new forms)
        time.sleep(0.50)

    # sense
    headword_translations = {}
    for sense_element in entry.findall('sense'):
        if 'id' not in sense_element.attrib:
            report_error(entry_id=entry_id,
                         problem=f"Sense without sense ID attribute. Sense was skipped.")
            continue
            sense_id_qualifiers = []
            sense_id = None
        else:
            sense_id = sense_element.attrib['id']
            sense_id_qualifiers = [xwbi.String(prop_nr="P186", value=sense_id)]  # sense id in source
        lexeme_sense = xwbi.Sense()
        claim = xwbi.Item(prop_nr="P5", value="Q112", qualifiers=sense_id_qualifiers)  # instance of dmlex Sense
        lexeme_sense.claims.add(claim)
        print("*** Adding new sense...")

        # label at sense level
        for label_element in sense_element.findall('label'):
            claim = get_label_claim(label_element)
            lexeme_sense.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)

        # definition
        # (sent to glosses in dict main language, and to a P218 statement) # GLOSS now disabled
        gloss = ""
        definition_text_lang = langCode_wiki  # assumes that it is the main language of the dictionary here
        for definition_element in sense_element.findall('definition'):
            if gloss != "":
                gloss += " | "
            for definition_text_element in definition_element.findall('text'):
                definition_text = definition_text_element.text.strip()
            gloss += definition_text
            definition_type_qualifiers = []
            if 'definitionType' in definition_element.attrib:
                definition_type = definition_element.attrib['definitionType']
                if definition_type not in controldata['definitionTypeTag']:
                    print(
                        f"Fatal error: definitionTypeTag '{definition_type}' in Sense '{sense_id}' is not a known controlloed value.")
                    report_error(entry_id=entry_id,
                                 problem=f"DefinitionTypeTag '{definition_type}' in Sense '{sense_id}' is not a known controlloed value.")
                    continue
                def_type_qid = controldata['definitionTypeTag'][definition_type]
                definition_type_qualifiers = [xwbi.Item(prop_nr="P189", value=def_type_qid)]
            claim = xwbi.MonolingualText(prop_nr="P218", text=definition_text, language=definition_text_lang,
                                         qualifiers=definition_type_qualifiers)
            lexeme_sense.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
            print(f"Added definition '{definition_text}' for language '{definition_text_lang}'.")
        # lexeme_sense.glosses.set(language=langCode_wiki, value=gloss) # gloss now disabled

        # headword translation
        # (sent to sense glosses and attached as P217 statement)
        headword_trans_lang = controldata['trans_langCodes'][0][
            'langCode_wiki']  # assumes that the main translation language is the language here
        translated_gloss = ""
        headword_translations[sense_id] = []
        seen_translations = []  # to avoid redundant translations (present in the data)
        for headword_trans_element in sense_element.findall('headwordTranslation'):
            if translated_gloss != "":
                translated_gloss += " | "
            trans_sense_label_claims = []
            for label_element in headword_trans_element.findall('label'):
                if 'tag' in label_element.attrib:
                    tag = label_element.attrib['tag']
                    if tag not in controldata['labelTag']:
                        print(
                            f"Fatal error: labelTag '{tag}' in headword translation in Sense '{sense_id}' is not a known controlled value.")
                        report_error(entry_id=entry_id,
                                     problem=f"LabelTag '{tag}' in headword translation in Sense '{sense_id}' is not a known controlled value.")
                        continue
                    tag_qid = controldata['labelTag'][tag]
                    references = xwbi.References()
                    reference_1 = xwbi.Reference()
                    reference_1.add(xwbi.String(prop_nr="P186", value=sense_id))
                    references.add(reference_1)
                    trans_sense_label_claims.append(
                        xwbi.Item(prop_nr="P203", value=tag_qid, qualifiers=[xwbi.String(prop_nr="P197", value=tag)],
                                  references=references))
                # headword_trans_label_qualifiers.append(xwbi.Item(prop_nr="P203", value=tag_qid))
                # headword_trans_label_qualifiers.append(xwbi.String(prop_nr="P197", value=tag))
            trans_forms = []
            for headword_trans_inflected_form_element in headword_trans_element.findall('inflectedForm'):
                new_form = process_inflected_form(form_element=headword_trans_inflected_form_element,
                                                  lang=headword_trans_lang)
                trans_forms.append(new_form)
            # if headword_trans and headword_trans not in seen_translations: # to be written after having got the new wikibase sense ID

            # claim = xwbi.MonolingualText(prop_nr="P217", text=headword_trans, language=headword_trans_lang, qualifiers=headword_trans_label_qualifiers)
            # lexeme_sense.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
            headword_trans = None
            for headword_trans_text_element in headword_trans_element.findall('text'):
                headword_trans = headword_trans_text_element.text.strip()
                if headword_trans not in seen_translations:
                    headword_translations[sense_id].append(
                        {'trans': headword_trans, 'label_claims': trans_sense_label_claims, 'trans_forms': trans_forms})
                    seen_translations.append(headword_trans)
                    translated_gloss += headword_trans
                    print(
                        f"Have found headword translation '{headword_trans}' for language '{headword_trans_lang}' in sense '{sense_id}', saving this for writing later, when wikibase sense id is known.")

        if translated_gloss != "":
            lexeme_sense.glosses.set(language=headword_trans_lang, value=translated_gloss)
        else:  # this means that it is a sense without any headword translation
            sense_id = None

        if sense_id:
            lexeme.senses.add(lexeme_sense)

    lexeme.write(clear=False)
    print(f">>> Created new lexeme: https://lexbib.elex.is/wiki/Lexeme:{lexeme.id}")
    time.sleep(0.5)
    source_mapping[entry_id] = lexeme.id

    # get senses ID mapping after writing (when sense ids are assigned)
    # Each written sense is matched to its source sense id through the P186
    # qualifier found on one of its P5 claims; the Wikibase sense id is then
    # stored in source_mapping under that source id.
    senses_count = 0
    for wikibase_sense in lexeme.senses.get_json():
        senses_count += 1
        # Positional placeholder for senses without a proper source id. Note
        # that it is only ever overwritten below; this loop never stores it
        # in source_mapping.
        sense_id = str(senses_count)
        # Tolerant lookups: a sense without any P5 claim, or a P5 claim that
        # carries no qualifiers, is skipped instead of raising KeyError.
        sense_claims = wikibase_sense.get('claims') or {}
        for p5_claim in sense_claims.get('P5', []):  # looks for proper sense id
            source_id_qualifiers = (p5_claim.get('qualifiers') or {}).get('P186')
            if source_id_qualifiers:
                sense_id = source_id_qualifiers[0]['datavalue']['value']
                source_mapping[sense_id] = wikibase_sense['id']

    # examples now, after knowing the sense id mapping
    # Every <example> of every <sense> becomes a P213 monolingual-text claim on
    # the lexeme. The claim is qualified with the Wikibase sense it belongs to
    # (P211, when the sense is mapped), headword/collocate markers (P219/P220),
    # the example translation (P210), and source identity / elaboration
    # (P193 / P199). All example claims are written with one lexeme.write().
    for sense_element in entry.findall('sense'):
        sense_id = sense_element.attrib.get('id')
        sense_lid = None
        if sense_id is not None:
            if sense_id in source_mapping:
                sense_lid = source_mapping[sense_id]
            else:
                print(f"Dummy sense '{sense_id}': sense id {sense_id} not found in mapping data.")
                time.sleep(1)
        for example_element in sense_element.findall('example'):
            print("... Adding new example")
            qualifiers = []
            if sense_lid:
                qualifiers.append(xwbi.SenseClaim(prop_nr="P211", value=sense_lid))

            # Reset per example so that an example without <text> can never
            # pick up the text element of a previously processed example.
            example_text = None
            example_text_element = None
            for example_text_element in example_element.findall('text'):
                # itertext() concatenates the text of the element and of its
                # marker children with XML entities already decoded, so
                # '&amp;' arrives as '&' and markup is dropped without a regex.
                example_text = "".join(example_text_element.itertext()).strip()

            if example_text_element is None or not example_text:
                # Without a text there is nothing to attach the qualifiers to.
                print(f"Fatal error: example in sense '{sense_id}' has no text and is skipped.")
                report_error(entry_id=entry_id,
                             problem=f"example in sense '{sense_id}' has no text and was left unwritten.")
                continue

            # example headword and collocate markers
            for headword_marker_element in example_text_element.findall('headwordMarker'):
                headword_marker_text = (headword_marker_element.text or "").strip()
                if headword_marker_text:
                    qualifiers.append(xwbi.String(prop_nr="P219", value=headword_marker_text))
            for collocate_marker_element in example_text_element.findall('collocateMarker'):
                collocate_marker_text = (collocate_marker_element.text or "").strip()
                if collocate_marker_text:
                    qualifiers.append(xwbi.String(prop_nr="P220", value=collocate_marker_text))

            # example translation
            for example_trans_element in example_element.findall('exampleTranslation'):
                # Reset per translation element: a translation without <text>
                # must not reuse the text of an earlier one.
                example_trans_text = None
                for example_trans_text_element in example_trans_element.findall('text'):
                    # itertext() keeps the text that follows marker children,
                    # which a bare .text would cut off.
                    example_trans_text = "".join(example_trans_text_element.itertext()).strip()
                if not example_trans_text:
                    continue
                expl_trans_lang = controldata['trans_langCodes'][0]['langCode_wiki']
                # this takes the first listed translation language
                # TODO: ensure that there is only one translation language (and, if not, assign the correct translation language)
                qualifiers.append(
                    xwbi.MonolingualText(prop_nr="P210", text=example_trans_text, language=expl_trans_lang))

            # source identity tag
            if 'sourceIdentity' in example_element.attrib:
                source_identity = example_element.attrib['sourceIdentity']
                if source_identity not in controldata['sourceIdentityTag']:
                    print(
                        f"Fatal error: source identity '{source_identity}' of example in sense '{sense_id}' is not a known controlled value.")
                    report_error(entry_id=entry_id,
                                 problem=f"source identity '{source_identity}' of example in sense '{sense_id}' is not a known controlled value.")
                else:
                    qualifiers.append(
                        xwbi.Item(prop_nr="P193", value=controldata['sourceIdentityTag'][source_identity]))
            # variant for literal not item:
            # qualifiers.append(xwbi.String(prop_nr="P221", value=example_element.attrib['sourceIdentity']))
            # source elaboration comment attribute
            if 'sourceElaboration' in example_element.attrib:
                # Property id written with its 'P' prefix, like every other
                # property reference in this script.
                qualifiers.append(xwbi.String(prop_nr="P199", value=example_element.attrib['sourceElaboration']))

            claim = xwbi.MonolingualText(prop_nr="P213", text=example_text, language=langCode_wiki,
                                         qualifiers=qualifiers)
            lexeme.claims.add(claim, action_if_exists=xwbi.ActionIfExists.APPEND_OR_REPLACE)
    lexeme.write()
    print("Finished adding examples.")
    time.sleep(0.5)

    # headword translations now, after knowing the sense id mapping
    # For each translation collected earlier, a lexeme in the target language
    # is created with a single sense that points back (P225) to the origin
    # sense; afterwards the origin sense receives a P224 link to that new
    # sense, qualified (P217) with the translation text.
    for sense_id, sense_translations in headword_translations.items():
        for translation in sense_translations:
            translated_headword = translation['trans']
            print(f"Processing a headword translation object: '{translated_headword}'")

            # create lexeme in translation target language with a sense
            # (POS inherited from source language entry)
            trans_lexeme = xwbi.wbi.lexeme.new(language=trans_lang_item, lexical_category=pos_item)
            trans_lexeme.lemmas.set(language=headword_trans_lang, value=translated_headword)
            # a dmlex headword translation object
            trans_lexeme.claims.add(xwbi.Item(prop_nr="P5", value="Q113"))
            # dmlex source dict
            trans_lexeme.claims.add(xwbi.Item(prop_nr="P207", value=dict_qid))
            for label_claim in translation['label_claims']:
                trans_lexeme.claims.add(label_claim)

            trans_sense = xwbi.Sense()
            trans_sense.glosses.set(language=langCode_wiki, value=headword)
            # link from target sense to origin sense
            origin_sense_lid = source_mapping[sense_id]
            trans_sense.claims.add(xwbi.SenseClaim(prop_nr="P225", value=origin_sense_lid))
            trans_lexeme.senses.add(trans_sense)

            inflected_forms = translation['trans_forms']
            if not inflected_forms:
                trans_lexeme.write()
                print(
                    f"Created new headword translation lexeme in language '{headword_trans_lang}': '{translated_headword}', https://lexbib.elex.is/wiki/Lexeme:{trans_lexeme.id}")
                time.sleep(1)
            else:
                # The lexeme is written once per added form.
                # NOTE(review): presumably intentional so each new form gets
                # its id before the next one is added - confirm before batching.
                for trans_form in inflected_forms:
                    print(f"Processing a headword translation form: '{trans_form}'")
                    trans_lexeme.forms.add(trans_form)
                    trans_lexeme.write()
                    print(f"Added inflected form to translation lexeme for translation '{translated_headword}'")
                    time.sleep(0.5)

            # write link pointing from translation origin sense to new translation target sense
            trans_sense_lid = trans_lexeme.id + "-S1"
            claimid = xwb.senseclaim(origin_sense_lid, "P224", trans_sense_lid)
            time.sleep(0.15)
            translation_qualifier_value = {"language": trans_lang_code, "text": translated_headword}
            xwb.setqualifier(origin_sense_lid, "P224", claimid, "P217",
                             translation_qualifier_value, "monolingualtext",
                             replace=False)

    print(
        f"\nFinished processing entry '{entry_id}', now on Wikibase, see it at 'https://lexbib.elex.is/wiki/Lexeme:{lexeme.id}'.\n")
    time.sleep(1)

# input("ENTER to proceed")

# add relations
# Every <relation> of the resource is written as one or two statements between
# already uploaded entities, looked up in source_mapping by source id:
#   * two members without a role -> the relation is written in both directions
#   * a "reference-origin" plus a "reference-target" member -> one direction
# Anything else, and any relation whose members are missing from
# source_mapping, is reported via report_error and left unwritten.
print("\n Starting to add relations...")
relation_count = 0
for relation in lexicographic_resource.findall('relation'):
    relation_count += 1
    relation_type = relation.attrib['type']
    if relation_type not in controldata['relation_types']:
        print(f"Fatal error: relation type {relation_type} is not a known controlled value.")
        report_error(entry_id="* Relation", problem=f"Relation type {relation_type} is not a known controlled value.")
        continue
    relation_prop = controldata['relation_types'][relation_type]['rel_prop']
    relation_qid = controldata['relation_types'][relation_type]['rel_qid']
    member_roles = {"reference-origin": None, "reference-target": None, "bidirectional": []}
    for member in relation.findall('member'):
        if 'role' not in member.attrib:
            member_roles['bidirectional'].append(member.attrib['ref'])
        elif member.attrib['role'] in ("reference-origin", "reference-target"):
            # Only the two directional roles are accepted here, so a role
            # literally named "bidirectional" cannot replace the list above.
            member_roles[member.attrib['role']] = member.attrib['ref']

    # (origin source id, target source id) pairs to be written for this relation
    directions = []
    if len(member_roles['bidirectional']) == 2 and "example" not in str(member_roles):
        print(
            f"[{relation_count}] Will write bidirectional relation to: {member_roles['bidirectional']}, relation type is '{relation_type}'.")
        first_ref, second_ref = member_roles['bidirectional']
        directions = [(first_ref, second_ref), (second_ref, first_ref)]
    elif member_roles["reference-origin"] and member_roles["reference-target"]:
        print(
            f"[{relation_count}] Will write unidirectional relation to: '{member_roles['reference-origin']}' with target '{member_roles['reference-target']}', relation type is '{relation_type}'.")
        directions = [(member_roles['reference-origin'], member_roles['reference-target'])]
    else:
        print(f"[{relation_count}] Fatal error: relation data not valid: {member_roles}")
        report_error(entry_id="* Relation",
                     problem=f"Relation data not valid and left unwritten: {member_roles}, {ET.tostring(relation, encoding='utf8')}")

    # A member that was never uploaded has no entry in source_mapping; report
    # the relation instead of aborting the whole run with a KeyError.
    unmapped_refs = sorted({ref for pair in directions for ref in pair if ref not in source_mapping})
    if unmapped_refs:
        print(f"[{relation_count}] Fatal error: relation member(s) {unmapped_refs} not found in mapping data.")
        report_error(entry_id="* Relation",
                     problem=f"Relation member(s) {unmapped_refs} not found in mapping data, relation left unwritten: {member_roles}")
    else:
        for origin_ref, target_ref in directions:
            write_relation(origin=source_mapping[origin_ref],
                           target=source_mapping[target_ref], prop=relation_prop, relqid=relation_qid)