Browse code

Separated the descendants-acquisition logic from test.py and implemented actual fetching of Wiktionary data.

Raymund Zacharias authored on 14/08/2017 10:59:04
Showing 3 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,125 @@
1
+<?xml version="1.0"?>
2
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
3
+  <siteinfo>
4
+    <sitename>Wiktionary</sitename>
5
+    <dbname>enwiktionary</dbname>
6
+    <base>https://en.wiktionary.org/wiki/Wiktionary:Main_Page</base>
7
+    <generator>MediaWiki 1.30.0-wmf.13</generator>
8
+    <case>case-sensitive</case>
9
+    <namespaces>
10
+      <namespace key="-2" case="case-sensitive">Media</namespace>
11
+      <namespace key="-1" case="first-letter">Special</namespace>
12
+      <namespace key="0" case="case-sensitive" />
13
+      <namespace key="1" case="case-sensitive">Talk</namespace>
14
+      <namespace key="2" case="first-letter">User</namespace>
15
+      <namespace key="3" case="first-letter">User talk</namespace>
16
+      <namespace key="4" case="case-sensitive">Wiktionary</namespace>
17
+      <namespace key="5" case="case-sensitive">Wiktionary talk</namespace>
18
+      <namespace key="6" case="case-sensitive">File</namespace>
19
+      <namespace key="7" case="case-sensitive">File talk</namespace>
20
+      <namespace key="8" case="first-letter">MediaWiki</namespace>
21
+      <namespace key="9" case="first-letter">MediaWiki talk</namespace>
22
+      <namespace key="10" case="case-sensitive">Template</namespace>
23
+      <namespace key="11" case="case-sensitive">Template talk</namespace>
24
+      <namespace key="12" case="case-sensitive">Help</namespace>
25
+      <namespace key="13" case="case-sensitive">Help talk</namespace>
26
+      <namespace key="14" case="case-sensitive">Category</namespace>
27
+      <namespace key="15" case="case-sensitive">Category talk</namespace>
28
+      <namespace key="90" case="case-sensitive">Thread</namespace>
29
+      <namespace key="91" case="case-sensitive">Thread talk</namespace>
30
+      <namespace key="92" case="case-sensitive">Summary</namespace>
31
+      <namespace key="93" case="case-sensitive">Summary talk</namespace>
32
+      <namespace key="100" case="case-sensitive">Appendix</namespace>
33
+      <namespace key="101" case="case-sensitive">Appendix talk</namespace>
34
+      <namespace key="102" case="case-sensitive">Concordance</namespace>
35
+      <namespace key="103" case="case-sensitive">Concordance talk</namespace>
36
+      <namespace key="104" case="case-sensitive">Index</namespace>
37
+      <namespace key="105" case="case-sensitive">Index talk</namespace>
38
+      <namespace key="106" case="case-sensitive">Rhymes</namespace>
39
+      <namespace key="107" case="case-sensitive">Rhymes talk</namespace>
40
+      <namespace key="108" case="case-sensitive">Transwiki</namespace>
41
+      <namespace key="109" case="case-sensitive">Transwiki talk</namespace>
42
+      <namespace key="110" case="case-sensitive">Wikisaurus</namespace>
43
+      <namespace key="111" case="case-sensitive">Wikisaurus talk</namespace>
44
+      <namespace key="114" case="case-sensitive">Citations</namespace>
45
+      <namespace key="115" case="case-sensitive">Citations talk</namespace>
46
+      <namespace key="116" case="case-sensitive">Sign gloss</namespace>
47
+      <namespace key="117" case="case-sensitive">Sign gloss talk</namespace>
48
+      <namespace key="118" case="case-sensitive">Reconstruction</namespace>
49
+      <namespace key="119" case="case-sensitive">Reconstruction talk</namespace>
50
+      <namespace key="828" case="case-sensitive">Module</namespace>
51
+      <namespace key="829" case="case-sensitive">Module talk</namespace>
52
+      <namespace key="2300" case="case-sensitive">Gadget</namespace>
53
+      <namespace key="2301" case="case-sensitive">Gadget talk</namespace>
54
+      <namespace key="2302" case="case-sensitive">Gadget definition</namespace>
55
+      <namespace key="2303" case="case-sensitive">Gadget definition talk</namespace>
56
+      <namespace key="2600" case="first-letter">Topic</namespace>
57
+    </namespaces>
58
+  </siteinfo>
59
+  <page>
60
+    <title>Reconstruction:Proto-Germanic/hungruz</title>
61
+    <ns>118</ns>
62
+    <id>3931063</id>
63
+    <revision>
64
+      <id>46910871</id>
65
+      <parentid>42165964</parentid>
66
+      <timestamp>2017-06-29T17:01:25Z</timestamp>
67
+      <contributor>
68
+        <username>Kennybot</username>
69
+        <id>1464621</id>
70
+      </contributor>
71
+      <minor/>
72
+      <comment>/* Declension */replaced: {{gem-decl-noun-u-mf|hungr}} → {{gem-decl-noun}} using [[Project:AWB|AWB]]</comment>
73
+      <model>wikitext</model>
74
+      <format>text/x-wiki</format>
75
+      <text xml:space="preserve" bytes="1564">{{reconstructed}}
76
+==Proto-Germanic==
77
+
78
+===Alternative forms===
79
+* {{l|gem-pro|*hunhruz}}
80
+
81
+===Etymology===
82
+From earlier {{m|gem-pro|*hunhruz}}, from {{etyl|ine-pro|gem-pro}} {{m|ine-pro|*kank-}}, {{m|ine-pro|*kenk-||to burn, dry, pain, desire, hunger, thirst}}. Cognate with {{cog|lt|kenkti||to damage, blight}}, {{cog|sa|काङ्क्षति||he wishes, desires}}.
83
+
84
+===Noun===
85
+{{gem-noun|m}}
86
+
87
+# [[hunger]]
88
+
89
+====Declension====
90
+{{gem-decl-noun}}
91
+
92
+====Derived terms====
93
+* {{l|gem-pro|*hungrijaną}}
94
+* {{l|gem-pro|*hungrugaz}}/{{l|gem-pro|*hungragaz}}
95
+
96
+====Descendants====
97
+* Old English: {{l|ang|hungor}}
98
+** Middle English: {{l|enm|hunger}}, {{l|enm|honger}}, {{l|enm|hounger}}
99
+*** Scots: {{l|sco|hounger}}, {{l|sco|hunger}}
100
+*** English: {{l|en|hunger}}
101
+* Old Frisian: {{l|ofs|hunger}}
102
+** Saterland Frisian: ({{l|stq|Geehunger}})
103
+** West Frisian: {{l|fy|honger}}, {{l|fy|hûnger}}
104
+* Old Saxon: {{l|osx|hungar}}
105
+** Middle Low German: {{l|gml|hunger}}
106
+*** Low German: {{l|nds-de|Hunger}}
107
+* Old Dutch: {{l|odt|*hungar}}, {{l|odt|hunger}}
108
+** Middle Dutch: {{l|dum|hunger}}, {{l|dum|honger}}
109
+*** Dutch: {{l|nl|honger}}
110
+* Old High German: {{l|goh|hungar}}
111
+** Middle High German: {{l|gmh|hunger}}
112
+*** German: {{l|de|Hunger}}
113
+*** Luxembourgish: {{l|lb|Honger}}
114
+*** Yiddish: {{l|yi|הונגער}}
115
+* Old Norse: {{l|non|hungr}}
116
+** Icelandic: {{l|is|hungur}}
117
+** Norwegian: {{l|no|hunger}}
118
+** Old Swedish: {{l|gmq-osw|hunger}}
119
+*** Swedish: {{l|sv|hunger}}
120
+** Danish: {{l|da|hunger}}
121
+* Gothic: *hunhrus &gt; {{l|got|𐌷𐌿𐌷𐍂𐌿𐍃|tr=hūhrus}}, *huggrus (in derivatives)</text>
122
+      <sha1>67s06inul2cv24rnf7g5vsvzun07ati</sha1>
123
+    </revision>
124
+  </page>
125
+</mediawiki>
0 126
new file mode 100644
... ...
@@ -0,0 +1,51 @@
1
+__author__ = 'Ray'
2
+
3
+import sys
4
+import io
5
+import re
6
+import urllib.request
7
+import urllib.parse
8
+import xml.etree.ElementTree as ET
9
+
10
+
11
def parse_descendants(wikitext):
    """Extract the ``{{l|language|lemma}}`` links from a page's Descendants section.

    The section starts at the first heading line named "Descendants" and ends
    at the next heading of the same or a higher level (or at the end of the
    page), so links from later sections are not picked up.

    Args:
        wikitext: Raw wikitext of a page. ``None`` or an empty string is
            accepted and treated as a page without a Descendants section.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts in page order.
        The list is empty when there is no Descendants section.
    """
    if not wikitext:
        return []

    heading = re.search(r"^(=+)[ \t]*Descendants[ \t]*=+[ \t]*$", wikitext, re.MULTILINE)
    if heading is None:
        return []

    # A heading with as many or fewer "=" signs closes the section; deeper
    # headings are sub-sections and stay part of it.
    level = len(heading.group(1))
    section_end = len(wikitext)
    any_heading = re.compile(r"^(=+)[^=\n].*=+[ \t]*$", re.MULTILINE)
    for following in any_heading.finditer(wikitext, heading.end()):
        if len(following.group(1)) <= level:
            section_end = following.start()
            break
    section = wikitext[heading.end():section_end]

    descendant_words = []
    # Non-greedy match of everything between "{{" and "}}"; "." does not cross
    # newlines, so a template is only recognised within a single line.
    for template in re.findall(r"\{\{(.*?)\}\}", section):
        fields = [field.strip() for field in template.split("|")]
        # Only link templates with both a language and a lemma are usable.
        if fields[0] == "l" and len(fields) >= 3:
            descendant_words.append({"language": fields[1], "lemma": fields[2]})
    return descendant_words


def get_descendants(title):
    """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word).

    The page is requested through the MediaWiki API in its XML export form
    (with ``redirects`` set) and the revision text is handed to
    ``parse_descendants``.

    Args:
        title: Full page title, including any namespace prefix.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts. The list is empty
        when the response holds no page text or the page has no Descendants
        section.

    Raises:
        urllib.error.URLError: If the request fails.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }
    request_data = urllib.parse.urlencode(request_parameters).encode("utf8")
    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
    # Keep the connection open only for the download; parsing happens afterwards.
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        wikipage_xml = wikipage_response.read()

    root = ET.fromstring(wikipage_xml)
    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
    text_element = root.find("./mwns:page/mwns:revision/mwns:text", ns)
    if text_element is None:
        # No <page>/<revision>/<text> in the export, e.g. the title does not exist.
        return []
    return parse_descendants(text_element.text)
46
+
47
+
48
def get_PG_decendants(word):
    """Look up the descendants of a Proto-Germanic (gem-pro) word on en.wiktionary.org.

    The word is appended to the "Reconstruction:Proto-Germanic/" page prefix
    and the resulting title is passed to ``get_descendants``, whose result is
    returned unchanged.
    """
    page_title = "Reconstruction:Proto-Germanic/" + word
    return get_descendants(page_title)
0 52
new file mode 100644
... ...
@@ -0,0 +1,81 @@
1
+# encoding: utf-8
2
+
3
+import sys
4
+from difflib import SequenceMatcher
5
+import io
6
+import re
7
+import argparse
8
+import urllib.request
9
+import urllib.parse
10
+import xml.etree.ElementTree as ET
11
+from descendants import get_PG_decendants, get_descendants
12
+
13
# (language code, lemma) pairs; each word's "id" is its position in this table.
_WORD_TABLE = (
    ("gem", "bītaną"),
    ("gem", "hrītaną"),
    ("deu", "beißen"),
    ("deu", "reißen"),
    ("nld", "bijten"),
    ("nld", "rijten"),
    ("deu", "scheißen"),
    ("gem", "skītaną"),
    ("nld", "schijten"),
)

# ("from", "to") pairs; presumably word ids linking an ancestor to a descendant.
_ORIGIN_TABLE = ((0, 2), (1, 3), (0, 4), (1, 5), (7, 6), (7, 8))

# Shared script state: the "escape" output switch plus the sample word data.
globalvars = {
    "escape": False,
    "data": {
        "words": [
            {"id": word_id, "language": language, "lemma": lemma}
            for word_id, (language, lemma) in enumerate(_WORD_TABLE)
        ],
        "origins": [
            {"from": source, "to": target} for source, target in _ORIGIN_TABLE
        ],
    },
}
37
+
38
+
39
def myformat(string, escape=None):
    """Return ``string`` ready for printing, optionally with non-ASCII characters escaped.

    Args:
        string: The text to format.
        escape: ``True`` to escape, ``False`` to return the text unchanged.
            When left as ``None`` the module-wide ``globalvars["escape"]``
            setting decides.

    Returns:
        A ``str`` in both cases. When escaping, non-ASCII characters are
        replaced by their ``\\xNN`` / ``\\uNNNN`` escape sequences.
    """
    if escape is None:
        escape = globalvars["escape"]
    if not escape:
        return string
    # encode() yields bytes; decode back so "%s" formatting does not print b'...'.
    return string.encode("unicode_escape").decode("ascii")
42
+
43
+
44
def compare(wordpair):
    """Print the matching blocks that difflib finds between the lemmas of two words.

    ``wordpair`` holds two word dicts, each with a "lemma" entry. For every
    block reported by ``SequenceMatcher.get_matching_blocks`` the block itself
    is printed, and for blocks of non-zero size the matching slice of each
    lemma is printed too (e.g. PG bītaną and hrītaną share "ītaną").
    Nothing is returned.
    """
    first, second = wordpair[0], wordpair[1]
    matcher = SequenceMatcher(None, first["lemma"], second["lemma"])
    print("\ncomparing '%s' and '%s' ..." % (myformat(first["lemma"]), myformat(second["lemma"])))
    for block in matcher.get_matching_blocks():
        print("\tcurrent block: %s" % (str(block)))
        size = block[2]
        if size <= 0:
            # The terminating dummy block has size 0 and no text to show.
            continue
        for position, word in enumerate(wordpair):
            # block[0] / block[1] are the start offsets in the first / second lemma.
            begin = block[position]
            end = begin + size
            lemma = word["lemma"]
            print("\t\t'%s'[%u:%u] = %s" % (myformat(lemma), begin, end, myformat(lemma[begin:end])))
60
+
61
+
62
def main(args):
    """Run the demo: compare same-language sample words, then print fetched descendants.

    Args:
        args: Command-line arguments without the program name. The only
            option is ``--escape``, which switches on escaped output by
            setting ``globalvars["escape"]``.

    Every pair of words in ``globalvars["data"]["words"]`` that share a
    language is passed to ``compare``. Afterwards the descendants of
    "hrītaną" are fetched with ``get_PG_decendants`` and printed as a list.
    """
    parser = argparse.ArgumentParser()
    # store_true yields a real boolean flag value.
    parser.add_argument("--escape", dest="escape", action="store_true", default=False, help="enable escape")
    parameters = parser.parse_args(args)
    globalvars["escape"] = parameters.escape

    words = globalvars["data"]["words"]
    for position, word1 in enumerate(words):
        # Pair each word only with the ones after it, so every pair is compared once.
        for word2 in words[position + 1:]:
            if word1["language"] == word2["language"]:
                compare((word1, word2))

    result = [
        {"language": word["language"], "lemma": myformat(word["lemma"])}
        for word in get_PG_decendants("hrītaną")
    ]
    print(str(result))


# Run only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main(sys.argv[1:])
81
+