"""Convert KNEXT lexical concept formulas into Scone knowledge-base forms.

use: python import_knext_lexical_concepts.py conceptsfile outputfile
"""

import collections
import re
import sys

# python-docx (``docx``) and nltk are imported inside the functions that need
# them, so the pure helpers in this module stay importable without those
# third-party packages installed.

_SPELLING_FILE = 'Words-Worldwide-Word-list-UK-US-2009.docx'

# One row of the spelling table: nine tab-separated, possibly empty, lowercase
# columns.  The last three columns may omit their trailing tab.
_SPELLING_ROW = re.compile(r'([a-z]*\t|\t)' * 6 + r'([a-z]*\t?|\t)' * 3)

# Regex groups 1-4 are the UK spellings, 5 is the shared US/UK spelling and
# 6-9 are the US spellings.  Variants are stored with the first US spelling in
# front, because element 0 of a variant list is used as the concept name.
_VARIANT_ORDER = (6, 1, 2, 3, 4, 5, 7, 8, 9)

# (all x ((x phi.n) <=> (x psi.n))) = eq link
# Not one of the cases listed in the original paper.
# Some of the cases in the data don't make sense as equalities
# (e.g. physical process and process).
_EQUIVALENCE = re.compile(
    r'\(all x \(\(x ([a-z_]*)[1-9]?\.n\) <=> \(x ([a-z_]*)[0-9]?\.n\)\)\)')

# (all x (x phi.n) (not (x psi.n))) = is-not-a link
# Not one of the cases listed in the original paper.
_IS_NOT_A = re.compile(
    r'\(all x \(x ([a-z_]*)[1-9]?\.n\) \(not \(x ([a-z_]*)[0-9]?\.n\)\)\)')

# group 1: (all x: [x phi.n] [x psi.n]) = is-a link
#          Every phi is a psi.
# group 2: (all x: [x phi.n] [x psi.n]) (mass terms or lexical plurals)
#          Every amount of phi is an amount of psi.
# group 4: (all x: [x phi.n] [x psi.n]) (count & plural or mass)
#          Every phi is an amount of psi.
#          (all x: [x phi.n] [x psi.n]) (mass / plural & singular)
#          Every amount of phi is a psi.
_IS_A = re.compile(
    r'\(all x \(x ([a-z_]*)[1-9]?\.n\) \(x ([a-z_]*)[0-9]?\.n\)\)')

_IS_A_COUNTABLE = ("(new-knext-is-a '({%(first)s} {countable}) "
                   "'({%(second)s} {countable}))\n")

# Rules that need only the formula line.  Each entry is (pattern, template);
# "first"/"second" in a template are the two captured concept names after
# spelling normalisation.  A template of None marks the branch-of-study rule,
# whose matches are collected and written out after the whole input is read.
# The rules are tried in this order.
_SIMPLE_RULES = (
    # group 3:
    # (gamma.name psi.n) = is-a link between an individual and a type
    # (The) gamma is/are a psi.
    (re.compile(r'\(([a-z_]*)[1-9]?\.name ([a-z_]*)[0-9]?\.n\)'),
     "(new-knext-type {%(second)s} {countable})\n"
     "(new-knext-indv {%(first)s} {%(second)s})\n"),
    # [(k phi.n) psi.n] (mass & count)
    # (The) phi is/are a psi.
    (re.compile(r'\(\(k ([a-z_]*)[1-9]?\.n\) ([a-z_]*)[0-9]?\.n\)'),
     "(new-knext-is-a '({%(first)s} {uncountable}) "
     "'({%(second)s} {countable}))\n"),
    # (all x: [x phi.n] [x item-of.n (k psi.n)]) (count & atomic ensemble)
    # Every phi is an item of psi.
    (re.compile(r'\(all x \(x ([a-z_]*)[1-9]?\.n\) '
                r'\(x item-of\.n \(k ([a-z_]*)[0-9]?\.n\)\)\)'),
     "(new-knext-type {%(first)s} {countable})\n"
     "(new-knext-type {%(second)s} {countable})\n"
     "(new-statement {%(second)s} {consists of} {%(first)s})\n"),
    # [(k phi.n) branch-of.n (k psi.n)] (branches of study)
    # (The) phi is/are a branch of (the) psi.
    # NOTE: cases where x is a branch of y and y is also a branch of x should
    # be filtered out; no such filtering is performed here.
    (re.compile(r'\(\(k ([ a-z_]*)[1-9]?\.n\) branch-of\.n '
                r'\(k ([a-z_]*)[0-9]?\.n\)\)'),
     None),
    # group 4
    # (all x: [x phi.n] [x (plur psi.n)]) (mass & singular)
    # Every amount of phi is an amount of psis.
    (re.compile(r'\(all x \(x ([a-z_]*)[1-9]?\.n\) '
                r'\(x \(plur ([a-z_]*)[0-9]?\.n\)\)\)'),
     "(new-knext-type {%(first)s} {uncountable})\n"
     "(new-knext-type {%(second)s} {uncountable})\n"
     "(new-statement {%(first)s} {consists of} {%(second)s})\n"),
    # group 5: is-a links; all nouns are countable
    # (all x: [x (plur phi-c.n)] [x (plur psi-c.n)]) (events, both senses)
    # Every phi is a psi.
    (re.compile(r'\(all x \(x \(plur ([a-z_]*)[1-9]?-c\.n\)\) '
                r'\(x \(plur ([a-z_]*)[0-9]?-c\.n\)\)\)'),
     _IS_A_COUNTABLE),
    # (all x: [x (plur phi-c.n)] [x psi.n]) (events, ambiguous & count)
    # Every phi is a psi.
    (re.compile(r'\(all x \(x \(plur ([a-z_]*)[0-9]?-c\.n\)\) '
                r'\(x ([a-z_]*)[1-9]?\.n\)\)'),
     _IS_A_COUNTABLE),
    # (all x: [x phi.n] [x (plur psi-c.n)]) (events, count & ambiguous)
    # Every phi is a psi.
    (re.compile(r'\(all x \(x ([a-z_]*)[1-9]?\.n\) '
                r'\(x \(plur ([a-z_]*)[0-9]?-c\.n\)\)\)'),
     _IS_A_COUNTABLE),
)


def import_us_and_uk_spelling(path=_SPELLING_FILE):
    """Load the US/UK spelling table from a .docx word list.

    Every paragraph that looks like a row of up to nine tab-separated
    spellings contributes one list of its non-empty variants, ordered with
    the first US spelling in front.

    Args:
        path: location of the .docx file; defaults to the word list the
            script has always read from the working directory.

    Returns:
        A dict mapping each spelling to the list of all its variants.  All
        spellings of one row share the same list object.
    """
    import docx  # python-docx; imported lazily, see the note at module top

    dictionary = {}
    # A .docx file is a zip archive, so it has to be read in binary mode.
    with open(path, 'rb') as handle:
        document = docx.Document(handle)
        for paragraph in document.paragraphs:
            # Match the possible nine spellings
            match = _SPELLING_ROW.match(paragraph.text)
            if not match:
                continue
            # Build a real list (not a lazy map/filter object) so that it can
            # be indexed and iterated more than once. Empty columns are
            # dropped.
            variants = []
            for index in _VARIANT_ORDER:
                words = match.group(index).split()
                if words:
                    variants.append(str(words[0]))
            for variant in variants:
                dictionary[variant] = variants
    return dictionary


# removing underscores from the concept names to be consistent with Scone
def remove_underscores(word):
    """Return *word* with every underscore replaced by a space."""
    return word.replace("_", " ")


def add_english_names(concept, dictionary, concepts_file):
    """Write an ``add-english-names`` form listing every spelling of *concept*.

    Args:
        concept: a spelling that is a key of *dictionary*.
        dictionary: mapping from a spelling to the list of its variants.
        concepts_file: writable text stream receiving the Lisp form.

    Raises:
        KeyError: if *concept* is not in *dictionary*.
    """
    names = list(dictionary[concept])
    elements = "".join(" {" + name + "}" for name in names)
    strings = "".join(' "' + name + '"' for name in names)
    concepts_file.write("(add-english-names (list" + elements
                        + ") (list" + strings + "))\n")


def _concept_names(match):
    """Return the two captured concept names with underscores removed."""
    return remove_underscores(match.group(1)), remove_underscores(match.group(2))


def _canonical_name(word, spellings, out):
    """Return the first listed spelling of *word*.

    When *word* is a known spelling other than the first one, an
    ``add-english-names`` form for it is written to *out* first.  Unknown
    words are returned unchanged.
    """
    if word not in spellings:
        return word
    canonical = spellings[word][0]
    if word != canonical:
        add_english_names(word, spellings, out)
    return canonical


def _countability(amount_of):
    """Map the text between quantifier and noun to a Scone countability type.

    A single space means the noun was used directly (countable); the only
    other value the sentence pattern can capture is " amount of ".
    """
    return "{countable}" if len(amount_of) == 1 else "{uncountable}"


def _pending_link(quantifier, match, spellings, out):
    """Prepare the sentence matcher for a formula whose link is written later.

    Returns ``(sentence_pattern, phi, psi)`` where phi and psi are the
    normalised concept names.  The pattern accepts both the spelling used in
    the formula and the normalised one, so the sentence is still recognised
    after a concept has been renamed to its first listed spelling.
    """
    raw_phi, raw_psi = _concept_names(match)
    phi = _canonical_name(raw_phi, spellings, out)
    psi = _canonical_name(raw_psi, spellings, out)

    def either(*names):
        return '(?:' + '|'.join(re.escape(name) for name in names) + ')'

    sentence = re.compile(
        '"' + quantifier + '( amount of | )' + either(raw_phi, phi)
        + ' is an?( amount of | )' + either(raw_psi, psi) + r'\."')
    return sentence, phi, psi


def _write_equivalence(line, match, spellings, stem, out, mistakes):
    """Handle one ``<=>`` formula: report it, merge spellings, or emit an eq."""
    phi, psi = _concept_names(match)
    if (psi in phi or phi in psi) and stem(psi) != stem(phi):
        # If one of the concepts is a substring, it's probably an error
        # however, make sure to not include same-versions of the word
        # (e.g. touching and touch)
        mistakes.write(line)
    elif psi in spellings and phi in spellings and \
            len(set(spellings[psi]).intersection(spellings[phi])) \
            == len(spellings[psi]):
        # if the two concepts are alternate spellings of each other, create
        # the concept with the "American" spelling and add the other
        # spellings on
        add_english_names(psi, spellings, out)
    else:
        phi = _canonical_name(phi, spellings, out)
        psi = _canonical_name(psi, spellings, out)
        out.write("(new-knext-eq '({" + phi + "} {thing}) "
                  "'({" + psi + "} {thing}))\n")


def _convert(lines, out, mistakes, spellings, stem):
    """Translate KNEXT *lines* into Scone forms written to *out*.

    Lines rejected as probable mistakes are copied to *mistakes*.
    """
    # branches of study are collected separately as a dictionary and only
    # after all of them have been inserted they are created as splits
    branches_of_study = collections.defaultdict(list)
    # Each of these holds (sentence_pattern, phi, psi) from the moment the
    # formula is seen until a line with the matching English sentence arrives.
    pending_not = None
    pending_is_a = None

    out.write("(in-namespace \"common\")\n")
    out.write("(new-relation {consists of})\n")
    out.write("(new-knext-split {thing} '({countable} {uncountable}))\n")

    for line in lines:
        # recognizing the different cases
        match = _EQUIVALENCE.search(line)
        if match:
            _write_equivalence(line, match, spellings, stem, out, mistakes)
            continue

        match = _IS_NOT_A.search(line)
        if match:
            pending_not = _pending_link("No", match, spellings, out)
            continue
        if pending_not is not None:
            sentence, phi, psi = pending_not
            found = sentence.search(line)
            if found:
                phi_class = _countability(found.group(1))
                psi_class = _countability(found.group(2))
                out.write("(new-knext-type {" + phi + "} " + phi_class + ")\n")
                out.write("(new-knext-type {" + psi + "} " + psi_class + ")\n")
                out.write("(new-is-not-a {" + phi + "} {" + psi + "})\n")
                pending_not = None
                continue

        match = _IS_A.search(line)
        if match:
            pending_is_a = _pending_link("Every", match, spellings, out)
            continue
        if pending_is_a is not None:
            sentence, phi, psi = pending_is_a
            found = sentence.search(line)
            if found:
                phi_class = _countability(found.group(1))
                psi_class = _countability(found.group(2))
                out.write("(new-knext-is-a '({" + phi + "} " + phi_class + ") "
                          "'({" + psi + "} " + psi_class + "))\n")
                pending_is_a = None
                continue

        for pattern, template in _SIMPLE_RULES:
            match = pattern.search(line)
            if not match:
                continue
            first, second = _concept_names(match)
            first = _canonical_name(first, spellings, out)
            second = _canonical_name(second, spellings, out)
            if template is None:
                branches_of_study[second].append(first)
            else:
                out.write(template % {"first": first, "second": second})
            break

    # After all branches of study have been collected, convert them into Scone
    # splits.
    for field, branches in branches_of_study.items():
        if branches:
            out.write("(new-knext-split {" + field + "} '( ")
            for branch in branches:
                out.write("{" + branch + "} ")
            out.write("))\n")


# main function to run to import the concepts
# use: python import_knext_lexical_concepts.py conceptsfile outputfile
def import_knext_lexical_concepts(concepts_file=None, output_file=None,
                                  mistakes_file="mistakes.txt",
                                  us_uk_dictionary=None, stem=None):
    """Read KNEXT concepts and append the equivalent Scone forms to a file.

    Called without arguments it behaves as the command-line script: the
    input and output paths come from ``sys.argv[1]`` and ``sys.argv[2]``.

    Args:
        concepts_file: path of the KNEXT concepts file; defaults to
            ``sys.argv[1]``.
        output_file: path the Lisp forms are appended to; defaults to
            ``sys.argv[2]``.
        mistakes_file: path that is overwritten with the equivalence lines
            rejected as probable mistakes.
        us_uk_dictionary: spelling table as returned by
            ``import_us_and_uk_spelling``; loaded from the .docx word list
            when omitted.
        stem: callable mapping a word to its stem; defaults to the ``stem``
            method of nltk's ``LancasterStemmer``.

    Raises:
        IndexError: if a path is omitted and missing from ``sys.argv``.
    """
    if concepts_file is None:
        concepts_file = sys.argv[1]
    if output_file is None:
        output_file = sys.argv[2]
    # Resolve the third-party dependencies before touching any output file, so
    # a missing package or word list does not leave truncated files behind.
    if stem is None:
        from nltk.stem.lancaster import LancasterStemmer
        stem = LancasterStemmer().stem
    if us_uk_dictionary is None:
        us_uk_dictionary = import_us_and_uk_spelling()

    with open(concepts_file, 'r') as concepts, \
            open(output_file, 'a') as lisp_concepts, \
            open(mistakes_file, 'w') as mistakes:
        _convert(concepts, lisp_concepts, mistakes, us_uk_dictionary, stem)


if __name__ == "__main__":
    import_knext_lexical_concepts()