# -*- coding: utf-8 -*-
"""Convert a Word document to HTML and extract exam questions from it."""
import re
import os
import json
import uuid

from bs4 import BeautifulSoup
from win32com.client import Dispatch
import pythoncom


class DocxConverter(object):
    """Convert a .docx file to HTML by automating Microsoft Word over COM."""

    def __init__(self, docpath="test4.docx"):
        """Remember the path of the document to convert.

        :param docpath: path to the Word document.
        """
        self.docpath = docpath

    def docx2html(self):
        """Save the document as HTML next to the source file.

        The output file gets a random UUID-based name so repeated
        conversions never overwrite each other.

        :return: absolute path of the generated ``.html`` file.
        """
        # Word runs as a separate process, so a relative path would not
        # reliably resolve against this script's working directory. Making
        # it absolute also keeps the returned HTML path usable by the caller.
        docpath = os.path.abspath(self.docpath)
        html = os.path.join(os.path.dirname(docpath), str(uuid.uuid4()) + ".html")

        pythoncom.CoInitialize()
        try:
            self.word = Dispatch("Word.Application")
            try:
                self.word.Visible = 0
                self.doc = self.word.Documents.Open(docpath)
                try:
                    # 10 is Word's WdSaveFormat value for filtered HTML.
                    self.doc.SaveAs(html, 10)
                finally:
                    self.doc.Close()
            finally:
                # Always shut Word down, even if opening or saving failed,
                # so no hidden Word instance is left behind.
                self.word.Quit()
        finally:
            pythoncom.CoUninitialize()
        return html


class QuestionsParser(object):
    """Parse exam questions out of an HTML file exported from Word."""

    # Section markers mapped to the record field they open, listed in the
    # order they are checked; when one paragraph contains several markers
    # the last match decides which field the paragraph is stored in.
    _FIELD_MARKERS = (
        (u"【题干】", "tg"),
        (u"【知识点】", "zsd"),
        (u"【难度】", "nd"),
        (u"【答案】", "da"),
        (u"【解析】", "jx"),
    )

    def __init__(self, name="test4.html"):
        """Load and parse the HTML file.

        :param name: path of the HTML file to parse.
        """
        # Read raw bytes so BeautifulSoup can detect the encoding declared
        # in the document instead of relying on the platform default, and
        # close the file handle deterministically.
        with open(name, "rb") as handle:
            self.html = handle.read()
        self.soup = BeautifulSoup(self.html, "html.parser")

    def get_paragraphs(self):
        """Return every ``<p>`` inside the ``WordSection1`` div.

        :return: list of paragraph tags; empty when the div is absent.
        """
        section = self.soup.find("div", class_="WordSection1")
        if section is None:
            return []
        return section.find_all("p")

    def parse_questions(self):
        """Extract the questions as a list of dicts.

        Paragraphs are scanned in document order. A paragraph containing
        ``【题型】`` sets the type heading for the questions that follow; a
        paragraph containing ``【题干】`` starts a new question. Every
        paragraph is appended, as HTML, to the field opened by the most
        recent marker.

        :return: list of dicts with keys ``tg``, ``tx``, ``zsd``, ``nd``,
            ``da`` and ``jx``; ``tx`` holds the text of the type heading
            in effect when the question started. Empty when the document
            contains no ``【题干】`` marker.
        """
        questions = []
        current = None  # question being built; None until the first 【题干】
        type_heading = ""
        field = ""

        for paragraph in self.get_paragraphs():
            text = paragraph.text

            if u"【题型】" in text:
                type_heading = text
                # The heading belongs to no question: stop collecting so it
                # is not appended to the previous question's last field.
                field = ""

            for marker, name in self._FIELD_MARKERS:
                if marker not in text:
                    continue
                field = name
                if name == "tg":
                    if current is not None:
                        questions.append(current)
                    # Stamp the type on the new question right away so the
                    # last question, and a question followed by a new type
                    # heading, both keep the correct type.
                    current = {
                        "tg": "",
                        "tx": type_heading,
                        "zsd": "",
                        "nd": "",
                        "da": "",
                        "jx": "",
                    }

            # Content seen before the first question has nowhere to go.
            if current is not None and field:
                current[field] += str(paragraph)

        if current is not None:
            questions.append(current)
        return questions

    def get_questions(self):
        """Return the parsed questions with ``tx`` trimmed and ``qno`` added.

        ``tx`` is reduced to the part after the last ``、`` of the type
        heading, and ``qno`` is taken from the question stem.

        :return: list of question dicts.
        """
        questions = self.parse_questions()
        for que in questions:
            que["tx"] = que["tx"].split(u"、")[-1]
            # NOTE(review): marker stripping below is commented out in the
            # original; the markers remain in the stored HTML.
            #que["tg"] = que["tg"].replace(u"【题干】","")
            #que["zsd"] = que["zsd"].replace(u"【知识点】","")
            #que["da"] = que["da"].replace(u"【答案】","")
            #que["jx"] = que["jx"].replace(u"【解析】","")
            que["qno"] = self.get_qno(que["tg"])
        return questions

    def get_qno(self, tg):
        """Extract the question number from a question stem.

        :param tg: HTML of the question stem.
        :return: the first run of digits found before the first ``、`` in
            the stem's text, or ``None`` when there is none.
        """
        stem_text = BeautifulSoup(tg, "html.parser").text
        match = re.search(r"\d+", stem_text.split(u"、")[0])
        if match is None:
            return None
        return match.group()


#docxconverter = DocxConverter()
#questionparser = QuestionsParser()

if __name__ == "__main__":
    #ques = questionparser.get_questions()
    #with open("t.json","w+") as f:
    #    f.write(json.dumps(ques))
    #docxconverter.docx2html()
    pass