import sys

from nltk.parse.stanford import StanfordDependencyParser, StanfordNeuralDependencyParser
from nltk.corpus import wordnet as wn


# remove underscores from concept names to be consistent with Scone
def remove_underscores(word):
    return " ".join(word.split("_"))


def discover_sense_for_name(def_name, synset_name):
    """Try to pick the WordNet sense of def_name that fits synset_name."""
    original_synset = wn.synset(synset_name)
    possible_synsets = wn.synsets(def_name, pos='n')
    if len(possible_synsets) == 0:
        # this word is not in WordNet, so it cannot be associated with a sense
        return ''
    if len(possible_synsets) == 1:
        return possible_synsets[0].name()
    hyp_rel = lambda s: s.hypernyms()
    path_to_synset = list(original_synset.closure(hyp_rel))
    potential_hypernyms = [s for s in possible_synsets if s in path_to_synset]
    if len(potential_hypernyms) == 1:
        return potential_hypernyms[0].name()
    # zero or several senses occur in the hypernym list -> try to find the
    # shortest path between all the possible senses
    # TODO: implement; for now, return ''
    return ''


scone_import_file = sys.argv[1]
scone_file = open(scone_import_file, 'a')

path_to_jar = '../11-411/NLP/stanford-parser/stanford-parser.jar'
path_to_models_jar = '../11-411/NLP/stanford-parser/stanford-parser-3.5.2-models.jar'
dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                             path_to_models_jar=path_to_models_jar)

corenlp_path_to_jar = 'stanford-corenlp/stanford-corenlp.jar'
corenlp_path_to_models_jar = 'stanford-corenlp/stanford-corenlp-3.5.2-models.jar'
neural_parser = StanfordNeuralDependencyParser(path_to_jar=corenlp_path_to_jar,
                                               path_to_models_jar=corenlp_path_to_models_jar)

# "entity" is the highest node out of all WordNet nouns
root = wn.synset('entity.n.01')
scone_file.write("(new-wordnet-type {" + root.name() + "} {thing})\n")

# DFS through the WordNet hyper/hyponym graph; visited holds
# (synset, parent, type) triples, so a synset is revisited once per hypernym
visited = set()
typ, inst = 't', 'i'
stack = [(r, root, typ) for r in root.hyponyms()]

# holonyms will be added at the end; each list element is (x, y, t) where x is
# a holonym of y and t marks a part, substance, or member relation
part, substance, member = 'p', 's', 'm'
holonyms = []
intersection_types = []
appos_types = []

while len(stack) > 0:
    current, parent, tp = stack.pop()
    if (current, parent, tp) in visited:
        continue
    visited.add((current, parent, tp))
    if tp == typ:
        scone_file.write("(new-wordnet-type {")
    else:
        scone_file.write("(new-wordnet-indv {")
    scone_file.write(current.name() + "} {" + parent.name() + "})\n")
    scone_file.write("(english {" + current.name() + "}")
    for lemma in current.lemmas():
        scone_file.write(" \"" + remove_underscores(lemma.name()) + "\"")
    scone_file.write(")\n")
    for instance in current.instance_hyponyms():
        stack.append((instance, current, inst))
    for hyp in current.hyponyms():
        stack.append((hyp, current, typ))
    for h in current.part_holonyms():
        holonyms.append((h, current, part))
    for h in current.substance_holonyms():
        holonyms.append((h, current, substance))
    for h in current.member_holonyms():
        holonyms.append((h, current, member))

    # if the definition is composed of several "sentences", take only the
    # first one for now
    defn = current.definition().split("; ")[0]

    # parse the definition with both parsers
    parse = next(dependency_parser.raw_parse(defn))
    nparse = next(neural_parser.raw_parse(defn))
    definition_triples = list(parse.triples())
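    # NOTE: DependencyGraph.triples() yields ((head_word, head_tag), relation,
    # (dependent_word, dependent_tag)) tuples starting from the root of the
    # parse, so the head of the first triple is the root word of the definition.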
    ndefinition_triples = list(nparse.triples())

    # the head of the first dependency triple is the top of the tree; it should
    # match the actual hypernym, so use relations with that word to add
    # information to Scone
    defn_hypernym, defn_hypernym_pos = definition_triples[0][0]
    ndefn_hypernym, ndefn_hypernym_pos = ndefinition_triples[0][0]

    # decide which parse to trust when the neural network-based and regular
    # parser results don't match; the synset name carries a ".n.NN" suffix, so
    # compare the parse roots against the lemma part of the parent's name
    parent_word = parent.name().split('.')[0]
    parse_tree = None
    if defn_hypernym == ndefn_hypernym and defn_hypernym == parent_word:
        parse_tree = nparse
    elif defn_hypernym == parent_word:
        parse_tree = parse
    elif ndefn_hypernym == parent_word:
        parse_tree = nparse
    elif defn_hypernym_pos[0] == 'N' and ndefn_hypernym_pos[0] == 'N':
        # both roots are nouns but neither matches the hypernym: too ambiguous
        continue
    elif defn_hypernym_pos[0] == 'N':
        parse_tree = parse
    elif ndefn_hypernym_pos[0] == 'N':
        parse_tree = nparse
    else:
        continue

    name = parse_tree.root['word']
    if 'compound' in parse_tree.root['deps']:
        # for now, ignore all but the first compound
        compound = parse_tree.root['deps']['compound'][0]
        name = parse_tree.nodes[compound]['word'] + "_" + name

    # find the adjectival modifier relations and create an intersection type,
    # skipping comparatives (JJR) and superlatives (JJS)
    if 'amod' in parse_tree.root['deps']:
        adjectives = [idx for idx in parse_tree.root['deps']['amod']
                      if parse_tree.nodes[idx]['ctag'] not in ('JJR', 'JJS')]
        new_types = []
        for idx in adjectives:
            # adjectives have to be {thing}s
            scone_file.write("(new-wordnet-type {" + parse_tree.nodes[idx]['word'] + "} {thing})\n")
            new_types.append("{" + parse_tree.nodes[idx]['word'] + "}")
        # once the entire WordNet is added, attempt to find the most reasonable
        # sense for the main definition noun; otherwise assign to Scone knowledge
        intersection_types.append((current.name(), name, new_types))

    if 'appos' in parse_tree.root['deps']:
        equivalents = parse_tree.root['deps']['appos']
        equivalents_words = []
        for idx in equivalents:
            w = parse_tree.nodes[idx]['word']
            if 'compound' in parse_tree.nodes[idx]['deps']:
                # for now, ignore all but the first compound
                compound = parse_tree.nodes[idx]['deps']['compound'][0]
                w = parse_tree.nodes[compound]['word'] + "_" + w
            equivalents_words.append(w)
        appos_types.append((current.name(), equivalents_words))

    # TODO: other relations on the definition root (conj, nmod, advmod, acl,
    # case, appos) could be mined from definition_triples in the same way

for (x, y, t) in holonyms:
    if t == part:
        relation = '{part of}'
    elif t == member:
        relation = '{member of}'
    else:
        relation = '{makes up}'
    scone_file.write("(new-statement {" + y.name() + "} " + relation + " {" + x.name() + "})\n")
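# The intersection and appos types are emitted only now, after the whole noun
# taxonomy has been written, so that discover_sense_for_name can resolve each
# definition's head noun against hypernym chains that already exist in Scone.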
for (synset_name, def_name, adjectives) in intersection_types:
    wn_synset = discover_sense_for_name(def_name, synset_name)
    if wn_synset == '':
        # no sense could be pinned down, so fall back to a bare {thing} subtype
        scone_file.write("(new-wordnet-type {" + def_name + "} {thing})\n")
    # begin intersection type
    scone_file.write("(new-is-a {" + synset_name + "} ")
    scone_file.write("(new-intersection-type nil '(")
    for t in adjectives:
        scone_file.write(t + " ")
    scone_file.write("{" + (wn_synset if wn_synset != '' else def_name) + "})))\n")

for (synset_name, appos) in appos_types:
    for def_name in appos:
        wn_synset = discover_sense_for_name(def_name, synset_name)
        if wn_synset == '':
            scone_file.write("(new-wordnet-type {" + def_name + "} {thing})\n")
        scone_file.write("(new-is-eq {" + synset_name + "} {" +
                         (wn_synset if wn_synset != '' else def_name) + "})\n")

scone_file.close()
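# A possible sketch for the TODO in discover_sense_for_name: when zero or
# several candidate senses lie on the hypernym path, fall back to the sense
# closest to the original synset in the WordNet graph. This helper is
# illustrative only and is not wired into the pipeline above; closest_sense is
# a hypothetical name, but Synset.shortest_path_distance is standard NLTK API.
def closest_sense(original_synset, possible_synsets):
    best_name, best_dist = '', None
    for s in possible_synsets:
        d = original_synset.shortest_path_distance(s)
        if d is not None and (best_dist is None or d < best_dist):
            best_name, best_dist = s.name(), d
    return best_name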