git.schokokeks.org
Repositories
Help
Report an Issue
fs-workbench.git
Code
Commits
Branches
Tags
Suche
Strukturansicht:
a08c3c7
Branches
Tags
master
fs-workbench.git
source
tools
correlation
ancestor.py
refactored lookup code to avoid duplication
Raymund Zacharias
committed
a08c3c7
at 2017-08-19 02:39:18
ancestor.py
Blame
History
Raw
import re

from mediawiki import get_wikipage_text

# Heading of the first etymology section, e.g. "===Etymology===".
_ETYMOLOGY_HEADING = re.compile(r"\=+Etymology.*?\=+")
# Body of every {{...}} template, matched non-greedily; "." does not cross
# newlines, so only templates written on a single line are found.
_TEMPLATE_BODY = re.compile(r"(?<={{).*?(?=}})")


def _parse_ancestors(etymology_text, language_code, ancestor_language_code):
    """Collect ancestor lemmas from the templates found in *etymology_text*.

    Recognised forms (parameters are separated by "|"):

    * ``{{etyl|<ancestor>|<language>}}`` immediately followed (as the next
      template) by ``{{m|<ancestor>|<lemma>}}``
    * ``{{inh|<language>|<ancestor>|<lemma>}}``
    * ``{{der|<language>|<ancestor>|<lemma>}}``

    Templates with too few parameters are skipped instead of raising.
    """
    templates = [
        body.split("|") for body in _TEMPLATE_BODY.findall(etymology_text)
    ]
    ancestor_words = []

    for index, parameters in enumerate(templates):
        name = parameters[0]  # str.split always yields at least one element

        if name == "etyl":
            # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
            has_follower = index + 1 < len(templates)
            if (
                has_follower
                and len(parameters) >= 3
                and parameters[1] == ancestor_language_code
                and parameters[2] == language_code
            ):
                following = templates[index + 1]
                if (
                    len(following) >= 3
                    and following[0] == "m"
                    and following[1] == ancestor_language_code
                ):
                    ancestor_words.append(
                        {"language": following[1], "lemma": following[2]}
                    )
        elif name in ("inh", "der"):
            # {{inh|en|gem-pro|*hwītaz}}
            # {{der|it|la|alacer||lively; happy, cheerful}}
            if (
                len(parameters) >= 4
                and parameters[1] == language_code
                and parameters[2] == ancestor_language_code
            ):
                ancestor_words.append(
                    {"language": parameters[2], "lemma": parameters[3]}
                )

    return ancestor_words


def get_ancestor(title, language_code, ancestor_language_code):
    """Return the ancestors of a word listed in its Etymology section.

    The wikitext for *title* is fetched with ``get_wikipage_text`` and
    everything after the first "Etymology" heading is scanned for
    ``etyl``/``m``, ``inh`` and ``der`` templates.

    Args:
        title: Page title to look up (usually a word).
        language_code: Language code of the word itself, e.g. ``"en"``.
        ancestor_language_code: Language code of the ancestor language to
            look for, e.g. ``"gem-pro"``.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts, in the order
        the templates appear. The list is empty when the page has no
        Etymology heading or no matching template.
    """
    wikipage_text = get_wikipage_text(title)
    # NOTE(review): what get_wikipage_text returns for an unknown title is not
    # visible here; treat a falsy result as "nothing to parse".
    if not wikipage_text:
        return []

    heading = _ETYMOLOGY_HEADING.search(wikipage_text)
    if heading is None:
        return []

    # Keep the text through to the very end: trimming the last character
    # would cut the closing "}}" of a template that ends the page.
    etymology_text = wikipage_text[heading.end():]
    return _parse_ancestors(
        etymology_text, language_code, ancestor_language_code
    )