句法分析分为两大类:Dependency Parsing(依存句法分析)和Constituency Parsing(成分句法分析)。中文Dependency Parsing对应的工具比较多,例如哈工大的ltp、Stanford CoreNLP等。但是Constituency Parsing对应的工具比较少,经过调研,Stanford CoreNLP应该是最好的选择。
本文主要介绍如何使用stanza来进行Constituency Parsing。stanza是Stanford NLP组官方的Python包,提供了调用Stanford CoreNLP的客户端接口,官网链接:https://stanfordnlp.github.io/stanza/
获取句子constituency parsing的结果
from stanza.server import CoreNLPClient


class StanzaClient():
    """Thin wrapper around stanza's ``CoreNLPClient`` for constituency parsing.

    The wrapped client is configured with the ``tokenize``, ``ssplit``,
    ``pos``, ``lemma`` and ``parse`` annotators, the ``"zh"`` properties
    shorthand, and JSON output.

    The instance owns the underlying client, so call :meth:`close` (or use
    the instance as a context manager) when done; otherwise the client is
    never stopped.
    """

    def __init__(self):
        self.client = CoreNLPClient(
            annotators=[
                'tokenize',
                'ssplit',
                'pos',
                'lemma',
                'parse',
            ],
            timeout=30000,  # presumably milliseconds -- confirm in stanza docs
            properties="zh",
            output_format="json",
            memory='5g',
        )

    def get_parse_tree(self, sent):
        """Return the bracketed constituency parse of ``sent``.

        Only the parse of the first sentence in the annotation is returned;
        if the text is split into several sentences, the remaining ones are
        ignored.

        Raises:
            IndexError: if the annotation contains no sentences.
        """
        ann = self.client.annotate(sent)
        return ann["sentences"][0]["parse"]

    def close(self):
        """Stop the underlying CoreNLP client."""
        self.client.stop()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


# For example, the parse tree of "今天天气不错啊" is as follows:
(ROOT (CP (IP (NP (NT 今天)) (NP (NN 天气)) (VP (VA 不错))) (SP 啊)))
组词:输入一句话,期望获得不同的组词方案
举例:今天天气不错啊 => 今天天气/不错啊,今天/天气/不错/啊,今天/天气不错啊
方法:使用nltk.tree.Tree对parser的结果进行处理,组词
import random

from nltk.tree import Tree


class Stack():
    """Minimal LIFO stack backed by a Python list."""

    def __init__(self):
        # Start with an empty container.
        self.__list = []

    def put(self, item):
        """Push ``item`` onto the stack."""
        self.__list.append(item)

    def get(self):
        """Pop and return the top item; raises IndexError when empty."""
        return self.__list.pop()

    def speek(self):
        """Return the top item without removing it; raises IndexError when empty.

        The (misspelled) name is kept so existing callers keep working.
        """
        return self.__list[-1]

    def empty(self):
        """Return True when the stack holds no items."""
        return not self.__list

    def size(self):
        """Return the number of items on the stack."""
        return len(self.__list)


class ConParserSeg():
    """Randomised word grouping driven by a constituency parse tree.

    ``height2prob`` maps a subtree height (as reported by
    ``nltk.tree.Tree.height``) to the probability that a subtree of that
    height is emitted as one merged chunk instead of being split further.
    """

    # Template only: each instance gets its own copy so that mutating one
    # instance's table never leaks into another.
    _DEFAULT_HEIGHT2PROB = {
        4: 0.7,
        5: 0.7,
        6: 0.5,
        7: 0.5,
        8: 0.3,
        9: 0.3,
    }

    def __init__(self, height2prob=None):
        """Store the merge-probability table.

        Args:
            height2prob: optional mapping ``height -> probability``. When
                omitted, a private copy of the default table is used. A
                mapping passed by the caller is stored as-is.
        """
        if height2prob is None:
            height2prob = dict(self._DEFAULT_HEIGHT2PROB)
        self.height2prob = height2prob

    def sent_cut(self, sent):
        """Split a bracketed parse string into a list of chunks.

        Args:
            sent: bracketed constituency parse, e.g. ``"(ROOT (IP ...))"``.

        Returns:
            The chunks in left-to-right sentence order. The grouping is
            random: each subtree whose height is in ``height2prob`` is
            merged into a single chunk with the configured probability.
            A root without children yields an empty list.
        """
        t = Tree.fromstring(sent)
        if len(t) == 0:
            return []

        q = Stack()
        # Only the first child of the root node is traversed.
        q.put(t[0])
        res = []
        while not q.empty():
            item = q.get()
            if isinstance(item, str):
                # Already a finished chunk (merged subtree or bare leaf).
                res.append(item)
                continue
            if item.height() == 2:
                # Subtree containing only leaves: emit its text.
                res.append("".join(item.leaves()))
                continue
            for child in item:
                if isinstance(child, str):
                    # A bare leaf next to subtrees has no height(); keep it
                    # as its own chunk instead of crashing.
                    q.put(child)
                    continue
                prob = random.random()
                h = child.height()
                if h in self.height2prob and prob <= self.height2prob[h]:
                    q.put("".join(child.leaves()))
                else:
                    q.put(child)
        # The stack visits the rightmost child first, so chunks were
        # collected right-to-left; restore sentence order.
        res.reverse()
        return res


if __name__ == "__main__":
    samples = [
        "(ROOT (IP (VP (PP (P 在) (NP (NR 日本))) (VP (VV 发生) (AS 了) "
        "(NP (DNP (NP (QP (CD 一) (CLP (M 件))) (NP (NN 千真万确))) (DEG 的)) "
        "(NP (NN 事)))))))",
        "(ROOT\n (IP\n (CP\n (IP\n (VP\n (VRD (VV 吃饭) (VV 去))))\n (SP 吧))\n"
        " (IP\n (NP (NN 太阳))\n (VP\n (PP (P 从)\n (NP (NN 东边)))\n (VP (VV 升起))))))",
    ]
    seg = ConParserSeg()
    for s in samples:
        ret = seg.sent_cut(s)
        print(" ".join(ret))