Demo entry 6687852

1

   

Submitted by 1 on Dec 31, 2017 at 04:48
Language: Python 3. Code size: 5.0 kB.

import math
import re
import sys

def get(s):
    """Tag a space-separated sentence with its most likely tag sequence.

    Runs Viterbi decoding over the module-level HMM tables: ``pos`` (list of
    tags), ``pi`` (initial probabilities), ``A`` (transition probabilities),
    ``B`` (emission probabilities) and ``cnt``/``dy`` (used to score words
    that are missing from ``B``).

    Scores are accumulated as log-probabilities so that long sentences cannot
    underflow to zero, and every cell records a back-pointer so backtracking
    never looks up a missing predecessor.

    Args:
        s: Sentence whose words are separated by single spaces.

    Returns:
        A dict mapping each word index (0-based) to its predicted tag.
    """
    text = s.split(" ")  # one observation per space-separated token
    num = len(text)
    neg_inf = float("-inf")

    def log_prob(p):
        # log(0) is undefined; treat a zero probability as "impossible".
        return math.log(p) if p > 0 else neg_inf

    def log_emission(tag, word):
        # Log-probability of `tag` emitting `word`.
        if word in B[tag]:
            return log_prob(B[tag][word])
        # Word missing from the table: fall back to a 0.5 pseudo-count.
        return log_prob(0.5 / (dy[tag] + cnt[tag]))

    # dp[i][tag]: best log-score of any tag path ending in `tag` at word i.
    # pre[i][tag]: tag chosen for word i - 1 on that best path.
    dp = [{} for _ in range(num)]
    pre = [{} for _ in range(num)]

    for tag in pos:
        dp[0][tag] = log_prob(pi[tag]) + log_emission(tag, text[0])
        pre[0][tag] = ""  # the first word has no predecessor

    if num > 1:
        # Transition logs do not depend on the word, so compute them once.
        log_a = {k: {j: log_prob(A[k][j]) for j in pos} for k in pos}
        other_tags = pos[1:]
        for i in range(1, num):
            prev_scores = dp[i - 1]
            for tag in pos:
                # Seed with the first tag so a back-pointer always exists,
                # even when every candidate is impossible (-inf).
                best_prev = pos[0]
                best_score = prev_scores[best_prev] + log_a[best_prev][tag]
                for prev in other_tags:
                    score = prev_scores[prev] + log_a[prev][tag]
                    if score > best_score:  # strict: ties keep the earliest tag
                        best_score = score
                        best_prev = prev
                dp[i][tag] = best_score + log_emission(tag, text[i])
                pre[i][tag] = best_prev

    # Pick the best final tag; ties keep the earliest tag in `pos`.
    best_last = ""
    for tag in pos:
        if best_last == "" or dp[num - 1][tag] > dp[num - 1][best_last]:
            best_last = tag

    # Follow the back-pointers from the last word to the first.
    ans = {}
    for i in range(num - 1, -1, -1):
        ans[i] = best_last
        best_last = pre[i][best_last]
    return ans

# Shared model state, filled in by the training code below.
# words: every distinct word; pos: every distinct part-of-speech tag.
words, pos = [], []
# cnt: occurrence count of each tag; pi: initial-state probability distribution.
cnt, pi = {}, {}
# A: state-transition probability matrix; B: symbol-emission probability matrix.
A, B = {}, {}

# Training, pass 1: collect the vocabulary and the tag inventory.
print("训练中......")
# The sets mirror the lists so each membership test is O(1) instead of a scan
# of the whole list; the lists still record first-seen order.
seen_words = set(words)
seen_tags = set(pos)
# The with-block closes the corpus file as soon as the pass is finished.
with open("语料.txt", 'r') as file:
    for s in file:
        for token in s.split(" "):      # tokens are separated by single spaces
            word = token.split('/')     # "word/tag" -> [word, tag]
            if len(word) > 1:           # ignore tokens that carry no tag
                if word[0] not in seen_words:
                    seen_words.add(word[0])
                    words.append(word[0])
                if word[1] not in seen_tags:
                    seen_tags.add(word[1])
                    pos.append(word[1])

# Zero-initialise every table: one entry per tag, one transition row per tag
# over all tags, and one emission row per tag over the whole vocabulary.
for tag in pos:
    pi[tag] = 0
    cnt[tag] = 0
    A[tag] = dict.fromkeys(pos, 0)
    B[tag] = dict.fromkeys(words, 0)

# Training, pass 2: count initial tags, tag transitions and word emissions.
line = 0  # number of corpus lines other than bare newlines
with open("语料.txt", 'r') as file:
    for s in file:
        # Skip blank lines. Iterating the file always advances to the next
        # line, so a blank line cannot stall the loop.
        if s == "\n":
            continue
        tmp = s.split(" ")
        line += 1
        flag = True  # True until the first tagged word of this line is seen
        for i, token in enumerate(tmp):
            word = token.split('/')       # "word/tag" -> [word, tag]
            if len(word) <= 1:            # token carries no tag
                continue
            cnt[word[1]] += 1             # one more occurrence of this tag
            B[word[1]][word[0]] += 1      # emission count: tag -> word
            if flag:
                # First tagged word of the line: counts as an initial state.
                pi[word[1]] += 1
                flag = False
            elif i > 0:
                # A transition is counted only when the immediately preceding
                # token is itself a tagged word.
                prev = tmp[i - 1].split('/')
                if len(prev) > 1:
                    A[prev[1]][word[1]] += 1


# dx[tag] / dy[tag]: number of transition / emission entries of `tag` that
# were zero and therefore received the 0.5 smoothing value.
dx = dict.fromkeys(pos, 0)
dy = dict.fromkeys(pos, 0)
for tag in pos:
    # Initial-state probability: share of counted lines that start with `tag`.
    pi[tag] = pi[tag] * 1.0 / line

    # Smoothing: replace every zero count with 0.5 and remember how many.
    transitions = A[tag]
    for nxt in pos:
        if transitions[nxt] == 0:
            dx[tag] += 1
            transitions[nxt] = 0.5
    emissions = B[tag]
    for w in words:
        if emissions[w] == 0:
            dy[tag] += 1
            emissions[w] = 0.5

# Turn the smoothed counts into transition and emission probabilities.
for tag in pos:
    transition_total = cnt[tag] + dx[tag]
    emission_total = cnt[tag] + dy[tag]
    for nxt in pos:
        A[tag][nxt] = A[tag][nxt] * 1.0 / transition_total
    for w in words:
        B[tag][w] = B[tag][w] * 1.0 / emission_total

print("训练结束")


# Interactive menu: evaluate on a tagged file, tag a typed sentence, or quit.
while True:
    # SECURITY: this prompt used to be passed through eval(), which executes
    # whatever expression the user types. int() accepts only a number; any
    # other input falls through to the invalid-input branch below.
    choice = input("输入以下命令进行操作:\n1.对已经进行标注的文本进行词性标注并比对答案\n2.输入句子进行词性标注\n3.退出\n")
    try:
        op = int(choice)
    except ValueError:
        op = None

    if op == 1:
        file_name = input("请输入文件名称:")
        try:
            file = open(file_name, 'r')
        except OSError as err:
            # A mistyped file name should not end the whole session.
            print("无法打开文件:", err)
            continue
        print("测试中......")
        tot = 0        # total number of tagged words in the test data
        right_num = 0  # number of words whose predicted tag matches
        with file:     # closes the test file when the evaluation is done
            for s in file:
                sentence_words = []  # the words of this line, tags removed
                ans1 = []            # reference tags, aligned with the words
                for token in s.split(" "):
                    word = token.split('/')  # "word/tag" -> [word, tag]
                    if len(word) > 1:        # ignore tokens without a tag
                        sentence_words.append(word[0])
                        ans1.append(word[1])
                tot += len(ans1)
                if sentence_words:
                    # Joining keeps one slot per word (even an empty one), so
                    # predictions stay aligned with the reference tags.
                    ans = get(" ".join(sentence_words))
                    for i, expected in enumerate(ans1):
                        if ans[i] == expected:
                            right_num += 1
        if tot == 0:
            # No tagged words at all: accuracy is undefined.
            print("文件中没有带标注的词,无法计算准确率")
        else:
            res = 1.0 * right_num / tot  # accuracy
            print("准确率为:", res)

    elif op == 2:
        s = input("请输入需要词性标注的句子,以空格分割: ")

        ans = get(s)
        text = s.split(" ")
        # Print each word followed by its predicted tag, all on one line.
        for i in range(len(ans)):
            print(text[i] + "/" + ans[i], end=" ")
        print("")

    elif op == 3:
        break
    else:
        print("非法输入!")

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).