Raymund Zacharias committed on 2017-08-14 10:59:04
Zeige 3 geänderte Dateien mit 257 Einfügungen und 0 Löschungen.
| ... | ... |
@@ -0,0 +1,125 @@ |
| 1 |
+<?xml version="1.0"?> |
|
| 2 |
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en"> |
|
| 3 |
+ <siteinfo> |
|
| 4 |
+ <sitename>Wiktionary</sitename> |
|
| 5 |
+ <dbname>enwiktionary</dbname> |
|
| 6 |
+ <base>https://en.wiktionary.org/wiki/Wiktionary:Main_Page</base> |
|
| 7 |
+ <generator>MediaWiki 1.30.0-wmf.13</generator> |
|
| 8 |
+ <case>case-sensitive</case> |
|
| 9 |
+ <namespaces> |
|
| 10 |
+ <namespace key="-2" case="case-sensitive">Media</namespace> |
|
| 11 |
+ <namespace key="-1" case="first-letter">Special</namespace> |
|
| 12 |
+ <namespace key="0" case="case-sensitive" /> |
|
| 13 |
+ <namespace key="1" case="case-sensitive">Talk</namespace> |
|
| 14 |
+ <namespace key="2" case="first-letter">User</namespace> |
|
| 15 |
+ <namespace key="3" case="first-letter">User talk</namespace> |
|
| 16 |
+ <namespace key="4" case="case-sensitive">Wiktionary</namespace> |
|
| 17 |
+ <namespace key="5" case="case-sensitive">Wiktionary talk</namespace> |
|
| 18 |
+ <namespace key="6" case="case-sensitive">File</namespace> |
|
| 19 |
+ <namespace key="7" case="case-sensitive">File talk</namespace> |
|
| 20 |
+ <namespace key="8" case="first-letter">MediaWiki</namespace> |
|
| 21 |
+ <namespace key="9" case="first-letter">MediaWiki talk</namespace> |
|
| 22 |
+ <namespace key="10" case="case-sensitive">Template</namespace> |
|
| 23 |
+ <namespace key="11" case="case-sensitive">Template talk</namespace> |
|
| 24 |
+ <namespace key="12" case="case-sensitive">Help</namespace> |
|
| 25 |
+ <namespace key="13" case="case-sensitive">Help talk</namespace> |
|
| 26 |
+ <namespace key="14" case="case-sensitive">Category</namespace> |
|
| 27 |
+ <namespace key="15" case="case-sensitive">Category talk</namespace> |
|
| 28 |
+ <namespace key="90" case="case-sensitive">Thread</namespace> |
|
| 29 |
+ <namespace key="91" case="case-sensitive">Thread talk</namespace> |
|
| 30 |
+ <namespace key="92" case="case-sensitive">Summary</namespace> |
|
| 31 |
+ <namespace key="93" case="case-sensitive">Summary talk</namespace> |
|
| 32 |
+ <namespace key="100" case="case-sensitive">Appendix</namespace> |
|
| 33 |
+ <namespace key="101" case="case-sensitive">Appendix talk</namespace> |
|
| 34 |
+ <namespace key="102" case="case-sensitive">Concordance</namespace> |
|
| 35 |
+ <namespace key="103" case="case-sensitive">Concordance talk</namespace> |
|
| 36 |
+ <namespace key="104" case="case-sensitive">Index</namespace> |
|
| 37 |
+ <namespace key="105" case="case-sensitive">Index talk</namespace> |
|
| 38 |
+ <namespace key="106" case="case-sensitive">Rhymes</namespace> |
|
| 39 |
+ <namespace key="107" case="case-sensitive">Rhymes talk</namespace> |
|
| 40 |
+ <namespace key="108" case="case-sensitive">Transwiki</namespace> |
|
| 41 |
+ <namespace key="109" case="case-sensitive">Transwiki talk</namespace> |
|
| 42 |
+ <namespace key="110" case="case-sensitive">Wikisaurus</namespace> |
|
| 43 |
+ <namespace key="111" case="case-sensitive">Wikisaurus talk</namespace> |
|
| 44 |
+ <namespace key="114" case="case-sensitive">Citations</namespace> |
|
| 45 |
+ <namespace key="115" case="case-sensitive">Citations talk</namespace> |
|
| 46 |
+ <namespace key="116" case="case-sensitive">Sign gloss</namespace> |
|
| 47 |
+ <namespace key="117" case="case-sensitive">Sign gloss talk</namespace> |
|
| 48 |
+ <namespace key="118" case="case-sensitive">Reconstruction</namespace> |
|
| 49 |
+ <namespace key="119" case="case-sensitive">Reconstruction talk</namespace> |
|
| 50 |
+ <namespace key="828" case="case-sensitive">Module</namespace> |
|
| 51 |
+ <namespace key="829" case="case-sensitive">Module talk</namespace> |
|
| 52 |
+ <namespace key="2300" case="case-sensitive">Gadget</namespace> |
|
| 53 |
+ <namespace key="2301" case="case-sensitive">Gadget talk</namespace> |
|
| 54 |
+ <namespace key="2302" case="case-sensitive">Gadget definition</namespace> |
|
| 55 |
+ <namespace key="2303" case="case-sensitive">Gadget definition talk</namespace> |
|
| 56 |
+ <namespace key="2600" case="first-letter">Topic</namespace> |
|
| 57 |
+ </namespaces> |
|
| 58 |
+ </siteinfo> |
|
| 59 |
+ <page> |
|
| 60 |
+ <title>Reconstruction:Proto-Germanic/hungruz</title> |
|
| 61 |
+ <ns>118</ns> |
|
| 62 |
+ <id>3931063</id> |
|
| 63 |
+ <revision> |
|
| 64 |
+ <id>46910871</id> |
|
| 65 |
+ <parentid>42165964</parentid> |
|
| 66 |
+ <timestamp>2017-06-29T17:01:25Z</timestamp> |
|
| 67 |
+ <contributor> |
|
| 68 |
+ <username>Kennybot</username> |
|
| 69 |
+ <id>1464621</id> |
|
| 70 |
+ </contributor> |
|
| 71 |
+ <minor/> |
|
| 72 |
+ <comment>/* Declension */replaced: {{gem-decl-noun-u-mf|hungr}} → {{gem-decl-noun}} using [[Project:AWB|AWB]]</comment>
|
|
| 73 |
+ <model>wikitext</model> |
|
| 74 |
+ <format>text/x-wiki</format> |
|
| 75 |
+ <text xml:space="preserve" bytes="1564">{{reconstructed}}
|
|
| 76 |
+==Proto-Germanic== |
|
| 77 |
+ |
|
| 78 |
+===Alternative forms=== |
|
| 79 |
+* {{l|gem-pro|*hunhruz}}
|
|
| 80 |
+ |
|
| 81 |
+===Etymology=== |
|
| 82 |
+From earlier {{m|gem-pro|*hunhruz}}, from {{etyl|ine-pro|gem-pro}} {{m|ine-pro|*kank-}}, {{m|ine-pro|*kenk-||to burn, dry, pain, desire, hunger, thirst}}. Cognate with {{cog|lt|kenkti||to damage, blight}}, {{cog|sa|काङ्क्षति||he wishes, desires}}.
|
|
| 83 |
+ |
|
| 84 |
+===Noun=== |
|
| 85 |
+{{gem-noun|m}}
|
|
| 86 |
+ |
|
| 87 |
+# [[hunger]] |
|
| 88 |
+ |
|
| 89 |
+====Declension==== |
|
| 90 |
+{{gem-decl-noun}}
|
|
| 91 |
+ |
|
| 92 |
+====Derived terms==== |
|
| 93 |
+* {{l|gem-pro|*hungrijaną}}
|
|
| 94 |
+* {{l|gem-pro|*hungrugaz}}/{{l|gem-pro|*hungragaz}}
|
|
| 95 |
+ |
|
| 96 |
+====Descendants==== |
|
| 97 |
+* Old English: {{l|ang|hungor}}
|
|
| 98 |
+** Middle English: {{l|enm|hunger}}, {{l|enm|honger}}, {{l|enm|hounger}}
|
|
| 99 |
+*** Scots: {{l|sco|hounger}}, {{l|sco|hunger}}
|
|
| 100 |
+*** English: {{l|en|hunger}}
|
|
| 101 |
+* Old Frisian: {{l|ofs|hunger}}
|
|
| 102 |
+** Saterland Frisian: ({{l|stq|Geehunger}})
|
|
| 103 |
+** West Frisian: {{l|fy|honger}}, {{l|fy|hûnger}}
|
|
| 104 |
+* Old Saxon: {{l|osx|hungar}}
|
|
| 105 |
+** Middle Low German: {{l|gml|hunger}}
|
|
| 106 |
+*** Low German: {{l|nds-de|Hunger}}
|
|
| 107 |
+* Old Dutch: {{l|odt|*hungar}}, {{l|odt|hunger}}
|
|
| 108 |
+** Middle Dutch: {{l|dum|hunger}}, {{l|dum|honger}}
|
|
| 109 |
+*** Dutch: {{l|nl|honger}}
|
|
| 110 |
+* Old High German: {{l|goh|hungar}}
|
|
| 111 |
+** Middle High German: {{l|gmh|hunger}}
|
|
| 112 |
+*** German: {{l|de|Hunger}}
|
|
| 113 |
+*** Luxembourgish: {{l|lb|Honger}}
|
|
| 114 |
+*** Yiddish: {{l|yi|הונגער}}
|
|
| 115 |
+* Old Norse: {{l|non|hungr}}
|
|
| 116 |
+** Icelandic: {{l|is|hungur}}
|
|
| 117 |
+** Norwegian: {{l|no|hunger}}
|
|
| 118 |
+** Old Swedish: {{l|gmq-osw|hunger}}
|
|
| 119 |
+*** Swedish: {{l|sv|hunger}}
|
|
| 120 |
+** Danish: {{l|da|hunger}}
|
|
| 121 |
+* Gothic: *hunhrus > {{l|got|𐌷𐌿𐌷𐍂𐌿𐍃|tr=hūhrus}}, *huggrus (in derivatives)</text>
|
|
| 122 |
+ <sha1>67s06inul2cv24rnf7g5vsvzun07ati</sha1> |
|
| 123 |
+ </revision> |
|
| 124 |
+ </page> |
|
| 125 |
+</mediawiki> |
| ... | ... |
@@ -0,0 +1,51 @@ |
| 1 |
+__author__ = 'Ray' |
|
| 2 |
+ |
|
| 3 |
+import sys |
|
| 4 |
+import io |
|
| 5 |
+import re |
|
| 6 |
+import urllib.request |
|
| 7 |
+import urllib.parse |
|
| 8 |
+import xml.etree.ElementTree as ET |
|
| 9 |
+ |
|
| 10 |
+ |
|
| 11 |
# A wikitext heading line such as "====Descendants====": group 1 is the run of
# leading "=" (its length is the heading level), group 2 the heading title.
_HEADING_PATTERN = re.compile(r"^(=+)[ \t]*(.*?)[ \t]*=+[ \t]*$", re.MULTILINE)
# The content between "{{" and "}}" of a template that sits on a single line.
_TEMPLATE_PATTERN = re.compile(r"\{\{(.*?)\}\}")


def _fetch_export_xml(title):
    """Download the XML export of page *title* from the en.wiktionary.org API."""
    request_data = urllib.parse.urlencode({
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }).encode("utf8")
    # Passing a request body makes urllib send this as a POST.
    request = urllib.request.Request("https://en.wiktionary.org/w/api.php", request_data)
    with urllib.request.urlopen(request) as response:
        return response.read()


def _iter_descendant_sections(wikitext):
    """Yield the body of every "Descendants" section found in *wikitext*."""
    headings = list(_HEADING_PATTERN.finditer(wikitext))
    covered_until = 0
    for index, heading in enumerate(headings):
        # Skip other headings, and a "Descendants" heading nested inside a
        # section that was already yielded (its links were covered there).
        if heading.group(2) != "Descendants" or heading.start() < covered_until:
            continue
        level = len(heading.group(1))
        section_end = len(wikitext)
        # A section ends at the next heading of the same or a higher level.
        for later in headings[index + 1:]:
            if len(later.group(1)) <= level:
                section_end = later.start()
                break
        covered_until = section_end
        yield wikitext[heading.end():section_end]


def _iter_link_templates(section):
    """Yield a language/lemma dict for every usable {{l|...}} template in *section*."""
    for template in _TEMPLATE_PATTERN.findall(section):
        name, *arguments = [part.strip() for part in template.split("|")]
        # Arguments containing "=" are named parameters (e.g. tr=...), never
        # the language code or the lemma.
        positional = [argument for argument in arguments if "=" not in argument]
        if name == "l" and len(positional) >= 2:
            yield {"language": positional[0], "lemma": positional[1]}


def get_descendants(title, fetch=None):
    """Get and parse the descendant sections of an en.wiktionary.org page.

    Args:
        title: Full page title (usually a word), e.g.
            "Reconstruction:Proto-Germanic/hungruz".
        fetch: Optional callable taking the title and returning the MediaWiki
            XML export of the page as bytes or str.  Defaults to querying the
            live en.wiktionary.org API; pass your own to parse a saved export
            without network access.

    Returns:
        A list of {"language": ..., "lemma": ...} dicts, one per {{l|...}}
        link template inside a "Descendants" section, in page order.  The
        list is empty when the page does not exist or has no such section.
    """
    if fetch is None:
        fetch = _fetch_export_xml
    root = ET.fromstring(fetch(title))

    # The export schema version (and with it the XML namespace) differs between
    # MediaWiki releases, so take the namespace from the root tag instead of
    # hard-coding one version.
    namespace = root.tag[:root.tag.index("}") + 1] if root.tag.startswith("{") else ""
    text_node = root.find("./{0}page/{0}revision/{0}text".format(namespace))
    if text_node is None or not text_node.text:
        return []

    descendant_words = []
    for section in _iter_descendant_sections(text_node.text):
        descendant_words.extend(_iter_link_templates(section))
    return descendant_words
|
| 46 |
+ |
|
| 47 |
+ |
|
| 48 |
def get_PG_decendants(word):
    """Look up the descendants of a Proto-Germanic (gem-pro) word.

    The word is resolved to its page in en.wiktionary.org's
    "Reconstruction:Proto-Germanic/" area and handed to get_descendants(),
    whose result is returned unchanged.
    """
    return get_descendants("Reconstruction:Proto-Germanic/" + word)
| ... | ... |
@@ -0,0 +1,81 @@ |
| 1 |
+# encoding: utf-8 |
|
| 2 |
+ |
|
| 3 |
+import sys |
|
| 4 |
+from difflib import SequenceMatcher |
|
| 5 |
+import io |
|
| 6 |
+import re |
|
| 7 |
+import argparse |
|
| 8 |
+import urllib.request |
|
| 9 |
+import urllib.parse |
|
| 10 |
+import xml.etree.ElementTree as ET |
|
| 11 |
+from descendants import get_PG_decendants, get_descendants |
|
| 12 |
+ |
|
| 13 |
# Shared script state: the "escape" output switch (set from the command line in
# main()) plus a small built-in sample data set.
globalvars = {
    "escape": False,
    "data": {
        # Sample lemmas; each word's "id" is its position in this list.
        "words": [
            {"id": word_id, "language": language, "lemma": lemma}
            for word_id, (language, lemma) in enumerate((
                ("gem", "bītaną"),
                ("gem", "hrītaną"),
                ("deu", "beißen"),
                ("deu", "reißen"),
                ("nld", "bijten"),
                ("nld", "rijten"),
                ("deu", "scheißen"),
                ("gem", "skītaną"),
                ("nld", "schijten"),
            ))
        ],
        # Links between word ids, stored under the keys "from" and "to".
        "origins": [
            {"from": source_id, "to": target_id}
            for source_id, target_id in (
                (0, 2),
                (1, 3),
                (0, 4),
                (1, 5),
                (7, 6),
                (7, 8),
            )
        ],
    },
}
|
| 37 |
+ |
|
| 38 |
+ |
|
| 39 |
def myformat(string):
    """Return *string* ready for printing, ASCII-escaped if escaping is enabled.

    When globalvars["escape"] is truthy, every non-ASCII character is replaced
    by its Python escape sequence (e.g. "\\u012b"); otherwise the string is
    returned unchanged.  The result is always a str.
    """
    if not globalvars["escape"]:
        return string
    # str.encode() yields bytes; decode again so that "%s" formatting prints
    # the escaped text itself rather than a b'...' literal.
    return string.encode("unicode_escape").decode("ascii")
|
|
| 42 |
+ |
|
| 43 |
+ |
|
| 44 |
def compare(wordpair):
    """Print the matching blocks shared by the lemmas of two words.

    wordpair is a pair of word dicts, each with a "lemma" entry.  Every block
    reported by difflib.SequenceMatcher is printed, and for each non-empty
    block the matching slice of either lemma is printed as well.
    e.g. PG bītaną and hrītaną share "ītaną"
    """
    first_lemma = wordpair[0]["lemma"]
    second_lemma = wordpair[1]["lemma"]
    matcher = SequenceMatcher(None, first_lemma, second_lemma)
    print("\ncomparing '%s' and '%s' ..." % (myformat(first_lemma), myformat(second_lemma)))
    for block in matcher.get_matching_blocks():
        print("\tcurrent block: %s" % (str(block)))
        length = block[2]
        if length <= 0:
            # Zero-length blocks (such as the terminating sentinel) have no slice to show.
            continue
        for position, word in enumerate(wordpair):
            # block[0] / block[1] are the start offsets in the first / second lemma.
            start = block[position]
            stop = start + length
            lemma = word["lemma"]
            print("\t\t'%s'[%u:%u] = %s" % (myformat(lemma), start, stop, myformat(lemma[start:stop])))
|
|
| 60 |
+ |
|
| 61 |
+ |
|
| 62 |
def main(args):
    """Run the demo: compare the sample words, then print descendants from Wiktionary.

    Args:
        args: Command-line arguments without the program name.  The only
            option is "--escape", which stores True in globalvars["escape"].

    Every pair of sample words that share a language is passed to compare().
    Afterwards the descendants of Proto-Germanic "hrītaną" are looked up via
    get_PG_decendants() and printed as a list of language/lemma dicts.
    """
    # Imported locally so this function does not rely on a module-level import.
    from itertools import combinations

    parser = argparse.ArgumentParser()
    # store_true yields a real boolean flag (True when given, False otherwise).
    parser.add_argument("--escape", dest="escape", action="store_true", default=False, help="enable escape")
    parameters = parser.parse_args(args)
    globalvars["escape"] = parameters.escape

    # combinations() visits each unordered pair once, in list order.
    for word1, word2 in combinations(globalvars["data"]["words"], 2):
        if word1["language"] == word2["language"]:
            compare((word1, word2))

    result = [
        {"language": word["language"], "lemma": myformat(word["lemma"])}
        for word in get_PG_decendants("hrītaną")
    ]
    print(str(result))
|
| 79 |
+ |
|
| 80 |
# Run the demo only when executed as a script, so that importing this module
# does not parse sys.argv or trigger output and network access.
if __name__ == "__main__":
    main(sys.argv[1:])
|
| 81 |
+ |
|
| 0 | 82 |