# Demo entry 6687852

1

Submitted by 1 on Dec 31, 2017 at 04:48
Language: Python 3. Code size: 5.0 kB.

```python
import re
import sys

def get(s):
    """Tag a space-separated sentence using the Viterbi algorithm.

    Reads the module-level model tables: ``pos`` (list of tags), ``pi``
    (initial tag probabilities), ``A`` (tag-to-tag transition probabilities),
    ``B`` (tag-to-word emission probabilities), ``cnt`` (occurrences per tag)
    and ``dy`` (number of smoothed emission entries per tag).

    Args:
        s: Sentence whose tokens are separated by single spaces.

    Returns:
        dict mapping each token index (0-based) to the tag chosen for it.

    Raises:
        ValueError: if the tag set ``pos`` is empty (model not trained).
    """
    if not pos:
        raise ValueError("model is not trained: the tag set is empty")

    # Split on a single space: callers index their own split(" ") result with
    # the keys of the returned dict, so both sides must tokenise identically.
    text = s.split(" ")
    num = len(text)

    # dp[i][tag]: best (scaled) score of any tag path ending in `tag` at token i.
    # back[i][tag]: predecessor tag on that best path ("" for the first token).
    dp = [dict.fromkeys(pos, 0) for _ in range(num)]
    back = [dict.fromkeys(pos, "") for _ in range(num)]

    # Initialisation. Every emission term is multiplied by 1000 to delay
    # floating-point underflow on long sentences.
    first = text[0]
    for tag in pos:
        if first in B[tag]:
            dp[0][tag] = pi[tag] * B[tag][first] * 1000
        else:
            # Unknown word: same smoothed mass that training gives unseen words.
            dp[0][tag] = pi[tag] * 0.5 * 1000 / (dy[tag] + cnt[tag])

    # Recursion.
    for i in range(1, num):
        token = text[i]
        prev_row = dp[i - 1]
        for tag in pos:
            # t is the (scaled) probability of emitting `token` from `tag`.
            if token in B[tag]:
                t = B[tag][token] * 1000
            else:
                t = 0.5 * 1000 / (dy[tag] + cnt[tag])
            # max() returns the first maximal predecessor, so a back-pointer
            # is always a real tag, even when every candidate score is 0
            # (zero initial probabilities or underflow). Leaving it empty in
            # that case makes the backtrack below fail with a KeyError.
            best_prev = max(pos, key=lambda k: prev_row[k] * A[k][tag] * t)
            dp[i][tag] = prev_row[best_prev] * A[best_prev][tag] * t
            back[i][tag] = best_prev

    # Termination: first tag with the highest final score.
    last_row = dp[num - 1]
    best = max(pos, key=lambda k: last_row[k])

    # Backtrack from the last token to the first, recording the chosen tags.
    ans = {}
    for i in range(num - 1, -1, -1):
        ans[i] = best
        best = back[i][best]
    return ans

# Model tables shared by training and by get(); all start empty.

# Every distinct word seen in the corpus.
words = list()
# Every distinct part-of-speech tag seen in the corpus.
pos = list()
# Number of occurrences of each tag.
cnt = dict()
# Initial state probability distribution (tag of a sentence's first word).
pi = dict()
# State transition probability matrix: A[previous_tag][tag].
A = dict()
# Symbol emission probability matrix: B[tag][word].
B = dict()

# Training, pass 1: collect the vocabulary (`words`) and the tag set (`pos`).
print("训练中......")
# Set mirrors of the two lists keep the "already seen?" test O(1); the lists
# themselves preserve first-seen order for the code that iterates them later.
_seen_words = set(words)
_seen_pos = set(pos)
# Iterating the file object yields one line per step and stops at end of
# file; the `with` block closes the corpus once the pass is finished.
with open("语料.txt", 'r') as file:
    for s in file:
        # split() without an argument discards the trailing newline and the
        # empty strings produced by runs of spaces, so the last tag on a line
        # never ends in "\n".
        for token in s.split():
            word = token.split('/')  # "word/tag" -> [word, tag]
            if len(word) > 1:  # only tokens that carry a tag are valid
                if word[0] not in _seen_words:
                    _seen_words.add(word[0])
                    words.append(word[0])
                if word[1] not in _seen_pos:
                    _seen_pos.add(word[1])
                    pos.append(word[1])

# Zero-initialise every table: one entry per tag, plus one transition cell
# per tag pair and one emission cell per (tag, word) pair.
for tag in pos:
    pi[tag] = 0
    cnt[tag] = 0
    A[tag] = dict.fromkeys(pos, 0)
    B[tag] = dict.fromkeys(words, 0)

# Training, pass 2: accumulate the raw counts behind pi, A and B.
line = 0  # number of non-blank corpus lines (later the denominator of pi)
# Iterate the file directly so every step advances to the next line and the
# loop ends at end of file; `with` closes the corpus afterwards.
with open("语料.txt", 'r') as file:
    for s in file:
        # split() without an argument drops the newline and empty tokens, so
        # adjacent words stay adjacent even when separated by several spaces.
        tmp = s.split()
        if not tmp:  # blank line: skip without counting it
            continue
        line += 1
        flag = True  # True until the first valid word of the line is seen
        pre = None   # split form of the previous token, if any
        for token in tmp:
            word = token.split('/')  # "word/tag" -> [word, tag]
            if len(word) > 1:  # valid token
                cnt[word[1]] += 1            # one more occurrence of this tag
                B[word[1]][word[0]] += 1     # emission count
                if flag:
                    # First valid word of the sentence: initial-state count.
                    pi[word[1]] += 1
                    flag = False
                elif pre is not None and len(pre) > 1:
                    # Previous token was valid too: transition count.
                    A[pre[1]][word[1]] += 1
            pre = word

# Turn raw counts into probabilities, one tag (row) at a time.
# dx[tag] / dy[tag]: how many transition / emission cells of that row were
# zero and therefore received the smoothing value 0.5.
dx = {}
dy = {}
for tag in pos:
    # Initial state probability: share of lines that start with this tag.
    pi[tag] = pi[tag] * 1.0 / line

    transitions = A[tag]
    emissions = B[tag]

    # Data smoothing bookkeeping: count the empty cells of each row.
    dx[tag] = sum(1 for nxt in pos if transitions[nxt] == 0)
    dy[tag] = sum(1 for w in words if emissions[w] == 0)

    # Each row is divided by the tag count plus the number of smoothed cells.
    transition_total = cnt[tag] + dx[tag]
    emission_total = cnt[tag] + dy[tag]

    # A zero count is replaced by 0.5 before dividing; other counts are kept.
    for nxt in pos:
        transitions[nxt] = (transitions[nxt] or 0.5) * 1.0 / transition_total
    for w in words:
        emissions[w] = (emissions[w] or 0.5) * 1.0 / emission_total

print("训练结束")

# Interactive menu: score a pre-tagged file, tag one sentence, or quit.
while True:
    choice = input("输入以下命令进行操作:\n1.对已经进行标注的文本进行词性标注并比对答案\n2.输入句子进行词性标注\n3.退出\n")
    # NOTE(security): the menu choice was previously passed through eval(),
    # which executes whatever expression the user types and raises on
    # non-numeric text. int() accepts the documented choices 1/2/3 and sends
    # everything else to the "invalid input" branch below.
    try:
        op = int(choice)
    except ValueError:
        op = None

    if op == 1:
        file_name = input("请输入文件名称:")
        try:
            file = open(file_name, 'r')
        except OSError as err:
            # A mistyped file name returns to the menu instead of ending the
            # session with a traceback.
            print("无法打开文件:", err)
            continue
        print("测试中......")
        tot = 0        # total number of tagged words in the test data
        right_num = 0  # number of words whose predicted tag matches
        with file:
            for s in file:
                sentence_words = []  # words of this line, tags removed
                ans1 = []            # gold tags, aligned with sentence_words
                for token in s.split():
                    word = token.split('/')  # "word/tag" -> [word, tag]
                    if len(word) > 1:  # valid token
                        sentence_words.append(word[0])
                        ans1.append(word[1])
                word_cnt = len(sentence_words)
                tot += word_cnt
                if word_cnt:
                    # join() keeps one token per word (even an empty one), so
                    # get()'s indices line up with ans1.
                    ans = get(" ".join(sentence_words))
                    # Compare the predicted tags with the gold tags.
                    right_num += sum(
                        1 for i in range(word_cnt) if ans[i] == ans1[i]
                    )

        # Accuracy; 0.0 when the file contains no tagged words at all.
        res = 1.0 * right_num / tot if tot else 0.0
        print("准确率为:", res)

    elif op == 2:
        s = input("请输入需要词性标注的句子，以空格分割: ")

        ans = get(s)
        text = s.split(" ")  # same tokenisation as get()
        num = len(ans)
        for i in range(0, num):  # print each token as word/tag
            print(text[i] + "/" + ans[i], end=" ")
        print("")

    elif op == 3:
        break
    else:
        print("非法输入！")
```

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.