失眠网 > Word转Latex：基于Python的半自动化排版

Word转Latex：基于Python的半自动化排版

时间：2021-12-30 04:55:36

Latex因其公式排版整洁，受到各类研究者的广泛使用。然而由于latex排版非实时可见，进而有可能造成写论文时用word排版，提交论文时转成latex排版的情况。而由于word和latex排版方式不完全一致，因而对研究者投稿进度造成一定的延误。基于个人经验，word转Latex的一大难题就是公式排版转化，本文拟借助Python来实现这一方面的自动转化。

第一步：生成测试文档

新建一个txt文件。将包含公式的测试文本从word复制到txt中。测试文本如下：

Let L(q,Q)=1-\bar{L}(q,Q), where \bar{L}(q,Q)=\bar{F}(q) \bar{G}(q/Q). For any given Q, let l_q (q)=\partial L(q,Q)/\partial q=f(q) \bar{G}(q/Q)+(\bar{F}(q))/Q. According to assumptions of f(∙) and g(∙), we know f(q)=\bar{F}(q)h(q) and g(q/Q)=\bar{G}(q/Q)k(q/Q). Further, its hazard function M(q)=(ql_q (q))/(\bar{L}(q,Q) )=H(q)+K(q/Q), and M(q)=H(q)+(K (q/Q))/Q>0. Thus, M(q) is increasing in q≥0. For any given q, let l_Q (Q)=\partial L(q,Q)/\partial Q= \bar{F}(q)g(q/Q)=-q/Q \bar{F}(q)k(q/Q) \bar{G}(q/Q)= \bar{L}(q,Q)K(q/Q). Further, its hazard function N(Q)=(Ql_Q (Q))/(\bar{L}(q,Q) )=-K (q/Q), which is increasing in Q≥0.〖

同时，导入可能用到的包，并定义一些后期用到的列表。

import string #关于字符串的处理函数import os #与操作系统/文件交互的一个接口mathnota=["_", "^","+","-","≤","≥","<",">","=","/","∙","¯",")","(","$"]brac_nota=[")","("]extra_nota=["w","q","Q","k","yQ"]punc_nota=[",","."]

第二步：文件清洗

在将文本从Word复制到txt过程中，有时会出现部分字符异常，出现诸如"〖"、"〗"等字符，因此需要事先删除。

def clean(file):
    """Strip copy-paste artefact characters from every line of *file*.

    Args:
        file: An open text file, or any other iterable that yields the
            paragraphs (lines) of the document as strings.

    Returns:
        A list with one cleaned string per input line. The characters
        "〖", "〗" and "■" are removed, surrounding whitespace (including
        the line break) is dropped, and runs of whitespace between words
        are collapsed to a single space.
    """
    # Deleting the unwanted characters before splitting means a word that
    # consisted only of such characters disappears entirely instead of
    # leaving an empty word (and a doubled space) behind.
    drop_table = str.maketrans("", "", "〖〗■")
    # Iterate the argument itself rather than a module-level file handle,
    # so the function works with whatever the caller passes in.
    return [" ".join(line.translate(drop_table).split()) for line in file]

第三步，删除空格造成的公式分割

同样地，在将文本从Word复制到txt过程中，字词之间可能会出现多余的空格，进而影响到后期的公式识别。因此，为了保证公式识别的准确性，需事先对异常空格进行识别并删除。

def delete_empty(file):
    """Remove the spaces that split one formula into several words.

    A word counts as "math" when it contains any character from the
    module-level ``mathnota`` list or is itself listed in ``extra_nota``.
    Two consecutive math words are glued together without a space; every
    other word keeps a single leading space.

    Args:
        file: List of paragraph strings.

    Returns:
        A list of paragraph strings of the same length. Each non-empty
        paragraph starts with one space, because every word that is not
        glued to its predecessor is emitted as ``" " + word``.
    """
    new_file = []
    for paragraph in file:
        words = paragraph.split()
        pieces = []
        # The first word has no predecessor, so it must never be glued.
        # Tracking the previous flag explicitly avoids looking it up at
        # index -1, which would compare the first word with the last word
        # of the paragraph.
        previous_is_math = False
        for word in words:
            word_is_math = word in extra_nota or any(
                char in mathnota for char in word
            )
            if previous_is_math and word_is_math:
                pieces.append(word)
            else:
                pieces.append(" " + word)
            previous_is_math = word_is_math
        new_file.append("".join(pieces))
    return new_file

第四步：识别公式，并在其前后增添$符号

def add_notation(file):
    """Wrap every word recognised as a formula in ``$...$``.

    A word is a formula when it is listed in the module-level
    ``extra_nota`` or contains at least one character of ``mathnota`` that
    is not a bracket from ``brac_nota``. One trailing punctuation mark from
    ``punc_nota`` is kept outside the closing ``$``.

    Args:
        file: List of paragraph strings.

    Returns:
        A list of paragraph strings whose words are joined by single
        spaces, with formula words delimited by ``$``.
    """
    # Brackets alone do not make a word a formula.
    operator_chars = set(mathnota) - set(brac_nota)
    new_file = []
    for paragraph in file:
        marked_words = []
        for word in paragraph.split():
            if word in extra_nota:
                marked_words.append("$" + word + "$")
                continue
            # Split off one trailing punctuation mark so that "q," is
            # recognised as the symbol "q" followed by a comma, the same
            # way "f(q)=1," is handled.
            if word[-1] in punc_nota:
                core, tail = word[:-1], word[-1]
            else:
                core, tail = word, ""
            has_operator = any(char in operator_chars for char in word)
            if has_operator or core in extra_nota:
                marked_words.append("$" + core + "$" + tail)
            else:
                marked_words.append(word)
        new_file.append(" ".join(marked_words))
    return new_file

第五步：对分数形式进行识别，并调整代码

# NOTE(review): select_value and fractile were recovered from a copy of the
# source in which line breaks and indentation had been lost. The nesting below
# is the reading that parses and assigns every variable before it is used;
# confirm it against the author's original file.
def select_value(index_left,index_right,place_slash,judge):
    """Pick a parenthesis position from two lists of character indices.

    Args:
        index_left: Positions of "(" characters (a slice built by the caller).
        index_right: Positions of ")" characters (a slice built by the caller).
        place_slash: Position of the "/" being processed; not read by this
            function.
        judge: 2 selects the backward scan that returns a "(" position; any
            other value selects the forward scan that returns a ")" position.

    Returns:
        An element of index_left (judge == 2) or of index_right (otherwise).

    Presumably the goal is to find the parenthesis that matches the group
    next to the slash -- TODO confirm with the author.
    """
    if judge==2:# a group precedes the slash: look for its "(", e.g. k(q/Q)
        # Start from the last entry of both lists and walk backwards.
        right_position=-1
        left_position=-1
        print(index_left)   # debug output
        print(index_right)  # debug output
        right_value=index_right[right_position]
        left_value=index_left[left_position]
        while (right_value>left_value):
            # Alternate: when both cursors are level move the right one,
            # otherwise let the left one catch up.
            if right_position==left_position:
                if abs(right_position)==len(index_right):
                    # Right list exhausted: settle on its first entry.
                    right_value=index_right[0]
                    break
                else:
                    right_position=right_position-1
                    right_value=index_right[right_position]
            else:
                if abs(left_position)==len(index_left):
                    # Left list exhausted: settle on its first entry.
                    left_value=index_left[0]
                    break
                else:
                    left_position=left_position-1
                    left_value=index_left[left_position]
        return left_value
    else: # a group follows the slash
        # Start from the first entry of both lists and walk forwards.
        right_position=0
        left_position=0
        right_value=index_right[right_position]
        left_value=index_left[left_position]
        while (right_value>left_value):
            # NOTE(review): the cursors start at 0 and only grow, so the
            # abs(...)==len(...) guards can only be reached after an index
            # equal to len(...) has already been used -- verify the bound.
            if right_position==left_position:
                if abs(left_position)==len(index_left):
                    left_value=index_left[-1]
                    break
                else:
                    left_position=left_position+1
                    left_value=index_left[left_position]
            else:
                if abs(right_position)==len(index_right):
                    right_value=index_right[-1]
                    break
                else:
                    right_position=right_position+1
                    right_value=index_right[right_position]
        return right_value
def fractile(file):
    r"""Rewrite "/" divisions inside each word as LaTeX \frac{...}{...}.

    Every paragraph is split on whitespace and each word containing at least
    one "/" is edited character by character: list entries are overwritten
    with pieces such as "\frac{", "}" and "{", then joined back into a word.
    Words are re-joined with single spaces.

    Args:
        file: List of paragraph strings.

    Returns:
        A list of paragraph strings of the same length.

    Reads the module-level list ``mathnota`` and calls ``select_value``.
    Prints a large amount of debug output to stdout.

    NOTE(review): a word that contains "/" but no "(" reaches
    ``index_left_parenthesis[left_upper]`` with an empty list, which raises
    IndexError -- confirm which inputs are expected here.
    """
    # Ideally one would first decide whether parentheses are redundant,
    # e.g. q/Q versus q/(Q). When Word formulas are pasted into a txt file,
    # however, parentheses are not added unless the numerator or denominator
    # contains an operator, so parentheses in the txt are taken as meaningful.
    new_file=[]
    for index_paragraphs, content_paragraphs in enumerate(file):
        content_paragraphs = content_paragraphs.split() #split the paragraph into words on whitespace
        new_paragraph=[]
        for index_word, content_word in enumerate(content_paragraphs): #process one word at a time
            # The next six names are assigned but not read again in this function.
            num_left_parenthesis=[] #for handling division
            num_right_parenthesis=[]
            num_left=0
            num_right=0
            state_right_paranthesis=0 #no change
            state_left_paranthesis=0 #second half only; 0 = no change
            new_word=[]                 # the word as a mutable list of characters
            index_left_parenthesis=[]   # positions of "("
            index_right_parenthesis=[]  # positions of ")"
            index_slash=[]              # positions of "/"
            index_math=[]               # positions of the remaining math symbols
            num_slash=1
            for index_charc,word_charc in enumerate(content_word): #scan every character of the word
                new_word.append(word_charc)
                if word_charc=="(":
                    index_left_parenthesis.append(index_charc)
                elif word_charc==")":
                    index_right_parenthesis.append(index_charc)
                elif word_charc=="/":
                    index_slash.append(index_charc)
                    num_slash=num_slash+1
                elif word_charc in mathnota: #all other math symbols
                    index_math.append(index_charc)
            if index_slash:# a non-empty list is truthy: the word has a "/"
                print(index_left_parenthesis)
                print(index_right_parenthesis)
                print(index_slash)
                for item,item_slash in enumerate(index_slash):
                    print ("60=",item_slash)
                    left_upper=0
                    left_lower=left_upper
                    print("1aa")
                    print(index_left_parenthesis)
                    print(left_upper)
                    # Advance left_upper to the first "(" at or after the
                    # slash, stopping at the last "(" if there is none.
                    while index_left_parenthesis[left_upper]<item_slash:
                        if left_upper<len(index_left_parenthesis)-1:
                            left_upper=left_upper+1
                        else:
                            break
                    print(left_upper)
                    right_lower=left_lower
                    right_upper=left_upper
                    if new_word[item_slash-1]==")": #a ")" directly precedes the slash; find out what kind of group it closes
                        # Walk left from the ")" to the nearest "(" and count
                        # the math symbols met on the way.
                        count_move=1
                        count_math=0
                        count_right=0
                        while new_word[item_slash-count_move]!="(":
                            count_move=count_move+1
                            if new_word[item_slash-count_move] in mathnota and new_word[item_slash-count_move]!="(":
                                count_math=count_math+1
                            if new_word[item_slash-count_move]==")":
                                count_right=count_right+1
                            if count_right>=2:
                                count_math=count_math+1
                                break
                        print(count_math)
                        print(count_move)
                        print(count_right)
                        if count_math==0:#no math symbol inside the parentheses
                            count_move=0#reset
                            # Walk left to the preceding "=" (or to the
                            # start of the word) and open the fraction there.
                            while new_word[item_slash-count_move]!="=":
                                count_move=count_move+1
                                if (item_slash-count_move)<=0:
                                    break
                            left_insert=item_slash-count_move
                            print("zzz")
                            new_word[left_insert]="=\\frac{"
                            new_word[item_slash]="}"
                            print(new_word)
                        else:#there are math symbols inside the parentheses
                            print("bcd")
                            print(len(index_left_parenthesis))
                            if len(index_left_parenthesis)==1:
                                left_insert=index_left_parenthesis[0]
                                print("1111")
                            else:
                                print("2222")
                                print(index_left_parenthesis[left_lower:left_upper])
                                print(left_lower)
                                print(left_upper)
                                print(index_right_parenthesis[right_lower:right_upper])
                                left_insert=select_value(index_left_parenthesis[left_lower:left_upper],index_right_parenthesis[right_lower:right_upper],item_slash,2)
                            print(left_insert)
                            # The chosen "(" becomes "\frac{", the ")" before
                            # the slash becomes "}", and the slash is dropped.
                            new_word[left_insert]="\\frac{"
                            new_word[item_slash-1]="}"
                            new_word[item_slash]=""
                            print (new_word)
                        #second half: the part after the slash
                        if new_word[item_slash+1]=="(":
                            new_word[item_slash+1]="{"
                            if len(index_left_parenthesis)==1:
                                right_insert=index_right_parenthesis[0]
                            else:
                                if left_upper==len(index_left_parenthesis)-1:
                                    right_insert=index_right_parenthesis[right_upper]
                                else:
                                    right_insert=select_value(index_left_parenthesis[left_upper:len(index_left_parenthesis)],index_right_parenthesis[right_upper:len(index_right_parenthesis)],item_slash,4)
                            new_word[right_insert]=")}"
                        else: #no "(" right after the slash
                            print("det")
                            print(new_word)
                            print(new_word[item_slash])
                            print(item_slash)
                            new_word[item_slash]=new_word[item_slash]+"{"
                            # Scan right for the next math symbol (or the end
                            # of the word) and close the brace before it.
                            count_move=0
                            while new_word[item_slash+count_move] not in mathnota:
                                print(new_word[item_slash+count_move])
                                if (item_slash+count_move)>=len(new_word)-1:
                                    break
                                count_move=count_move+1
                            new_word[item_slash+count_move]="}"+new_word[item_slash+count_move]
                    else: #no ")" before the slash, e.g. F(q/(Q)), 3+q/Q, q/Q
                        #only the nearest math symbol to the left has to be located
                        count_move=1
                        while new_word[item_slash-count_move] not in mathnota:
                            count_move=count_move+1
                        left_insert=item_slash-count_move
                        print("ddef")
                        # NOTE(review): the symbol found at left_insert is
                        # overwritten with "(\frac{", whatever it was.
                        new_word[left_insert]="(\\frac{"
                        new_word[item_slash]="}"
                        print(new_word)
                        #second half: the part after the slash
                        if new_word[item_slash+1]=="(":
                            print("xxx")
                            new_word[item_slash+1]="{"
                            if len(index_left_parenthesis)==1:
                                right_insert=index_right_parenthesis[0]
                            else:
                                if left_upper==len(index_left_parenthesis)-1:
                                    right_insert=index_right_parenthesis[right_upper]
                                else:
                                    right_insert=select_value(index_left_parenthesis[left_upper:len(index_left_parenthesis)],index_right_parenthesis[right_upper:len(index_right_parenthesis)],item_slash,4)
                            new_word[right_insert]=")}"
                        else: #no "(" right after the slash
                            print("yyy")
                            new_word[item_slash]=new_word[item_slash]+"{"
                            # Scan right for the next math symbol; unlike the
                            # branch above there is no end-of-word guard here.
                            count_move=0
                            while new_word[item_slash+count_move] not in mathnota:
                                count_move=count_move+1
                            print("fff")
                            print(new_word[item_slash])
                            print(new_word[item_slash+count_move])
                            print(item_slash+count_move)
                            print(count_move)
                            new_word[item_slash+count_move]="}"+new_word[item_slash+count_move]
                            # example input: q=f(q) \bar{G}(q/Q)+(\bar{F}(q))/Q g(q/Q)
            # Words without "/" have no else branch and fall through unchanged.
            new_word=''.join(new_word)
            new_paragraph.append(new_word)
        new_paragraph=' '.join(new_paragraph)
        new_file.append(new_paragraph)
    return new_file

第六步：对特定符号进行代码修正

def replace(file):
    r"""Replace Unicode math symbols with the matching LaTeX commands.

    Each paragraph is split on whitespace, every word is rewritten, and the
    words are joined again with single spaces.

    Args:
        file: List of paragraph strings.

    Returns:
        A list of paragraph strings of the same length in which the symbols
        ∈ ≥ ≤ ∂ ∞ α β are replaced by \in, \geq, \leq, \partial, \infty,
        \alpha and \beta. Every command is followed by one space so that it
        does not run into the next letter; this also separates an existing
        "\partial" from a letter glued directly after it.
    """
    # Raw strings are essential: in a normal literal "\a" and "\b" are the
    # BEL and backspace control characters, which would corrupt \alpha and
    # \beta. The literal "\partial" rule comes first so that the command
    # produced for "∂" is not processed a second time.
    replacements = (
        (r"\partial", r"\partial "),
        ("∂", r"\partial "),
        ("∈", r"\in "),
        ("≥", r"\geq "),
        ("≤", r"\leq "),
        ("∞", r"\infty "),
        ("α", r"\alpha "),
        ("β", r"\beta "),
    )
    new_file = []
    for paragraph in file:
        converted_words = []
        for word in paragraph.split():
            for symbol, command in replacements:
                word = word.replace(symbol, command)
            converted_words.append(word)
        new_file.append(" ".join(converted_words))
    return new_file

第七步：生成主程序

# Conversion pipeline: read before.txt, run every stage in order and write
# the LaTeX-ready text to after.txt.
# f1 is the source text to convert; bs is the output document.
with open('before.txt', 'r', encoding="utf-8") as f1, \
        open('after.txt', 'w', encoding="utf-8") as bs:
    f2 = clean(f1)         # remove copy-paste artefact characters
    f3 = delete_empty(f2)  # remove spaces that split a formula
    f4 = add_notation(f3)  # add the $ delimiters
    f5 = fractile(f4)      # convert a/b into \frac{a}{b}
    f6 = replace(f5)       # replace Unicode math symbols
    # One output line per paragraph: joining with an empty string would fuse
    # the last word of a paragraph with the first word of the next one.
    bs.write("\n".join(f6))
    # No explicit close: the with-statement closes both files on exit.

题外话：由于并未大规模测试代码，所以其适用性存在问题（百分之百有问题）。当然，还有两种更简单的排版策略，其一，直接花钱买mathtype插件。该插件具备一键转换Tex功能。其二，让学生帮忙改，别问我怎么知道的。

如果觉得《Word转Latex：基于Python的半自动化排版》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。