# -*- coding: utf-8 -*-
import json
import os
import re
import shutil
import threading
import uuid

import pythoncom
from bs4 import BeautifulSoup
from win32com.client import Dispatch

from upload_to_oss import TedOSS


class DocxConverter(object):
    """Convert a Word document to HTML by automating Word through COM.

    ``docx2html`` saves the document as ``<uuid>.html`` next to the source
    file, deletes the source file, and starts a background thread that
    uploads the contents of the sibling ``<uuid>.files`` directory through
    ``self.oss`` and then removes that directory.
    """

    # Value passed as the FileFormat argument of Document.SaveAs; 10 is
    # wdFormatFilteredHTML in Word's WdSaveFormat enumeration.
    _WD_FORMAT_FILTERED_HTML = 10

    def __init__(self, docpath="test4.docx"):
        """Remember the document path and create the OSS client.

        Args:
            docpath: Path of the Word document to convert.
        """
        # NOTE(review): Word may resolve relative paths against its own
        # working directory rather than this process's -- callers should
        # presumably pass an absolute path; confirm.
        self.docpath = docpath
        self.oss = TedOSS()

    def upload_imgfiles(self, uuidhtml):
        """Upload every file in ``<uuidhtml>.files`` and delete the directory.

        The directory is looked up next to ``self.docpath``. Nothing happens
        when it does not exist. If an upload raises, the directory is left in
        place and the exception propagates.

        This method performs no COM calls and therefore does not touch the
        COM apartment state; ``docx2html`` balances its own initialisation.

        Args:
            uuidhtml: Base name (without extension) of the generated HTML
                file; also used as the prefix of the uploaded object names.
        """
        files_dirname = uuidhtml + ".files"
        imgdir = os.path.join(os.path.dirname(self.docpath), files_dirname)
        if not os.path.exists(imgdir):
            return
        for filename in os.listdir(imgdir):
            local_path = os.path.join(imgdir, filename)
            # Object names always use "/" regardless of the local separator.
            oss_key = files_dirname + "/" + filename
            self.oss.upload_from_local(local_path, oss_key)
        shutil.rmtree(imgdir)

    def docx2html(self):
        """Save the document as HTML and start the background upload.

        On success the source document at ``self.docpath`` is deleted and
        ``self.uuidhtml`` holds the generated base name. If Word fails, the
        document is closed, Word is shut down, the source file is kept and
        the exception propagates.

        Returns:
            Path of the generated ``.html`` file.
        """
        # COM initialisation is per thread and must be balanced on the same
        # thread, so both calls live here rather than in the upload thread.
        pythoncom.CoInitialize()
        try:
            self.word = Dispatch("Word.Application")
            try:
                self.word.Visible = 0
                self.doc = self.word.Documents.Open(self.docpath)
                try:
                    self.uuidhtml = str(uuid.uuid4())
                    html = os.path.join(
                        os.path.dirname(self.docpath),
                        self.uuidhtml + ".html",
                    )
                    self.doc.SaveAs(html, self._WD_FORMAT_FILTERED_HTML)
                finally:
                    self.doc.Close()
            finally:
                # Always shut Word down so a failed conversion does not leave
                # a hidden Word instance running.
                self.word.Quit()
        finally:
            # Drop the COM references before leaving the apartment.
            self.doc = None
            self.word = None
            pythoncom.CoUninitialize()

        os.remove(self.docpath)
        task = threading.Thread(
            target=self.upload_imgfiles, args=(self.uuidhtml,)
        )
        task.start()
        return html


# NOTE(review): the text below is the beginning of ``QuestionsParser`` exactly
# as it appeared on the same collapsed source line. Its definition continues
# on the following line and is incomplete there, so it is preserved verbatim
# instead of being reformatted; restore it as code together with the rest of
# the class.
# class QuestionsParser(object): """试题解析 """ def __init__(self,name="test4.html"): self.html = open(name,"r").read() self.soup = BeautifulSoup(self.html,"html.parser") def get_paragraphs(self): """ """ wordsection = self.soup.find("div",class_="WordSection1") #print wordsection pars = wordsection.find_all("p") return pars def parse_questions(self): """提取试题 """ que_type_dct = {} paragraphs = self.get_paragraphs() for i,p in enumerate(paragraphs): print p.text if u"【题型】" in p.text: que_type_dct["type"] = p.text.split("、")[-1] def parse_questions(self): """提取试题 """ data = [] tmp_val = {} tx_name = "" key = "" paragraphs = self.get_paragraphs() for i,p in enumerate(paragraphs): if u"【题型】" in p.text: tx_name = p.text if u"【题干】" in p.text: key = "tg" tmp_val["tx"] = tx_name if tmp_val.get("tg"): 
data.append(tmp_val) tmp_val = {"tg":"","tx":"","zsd":"","nd":"","da":"","jx":""} if u"【知识点】" in p.text: key = "zsd" if u"【难度】" in p.text: key = "nd" if u"【答案】" in p.text: key = "da" if u"【解析】" in p.text: key = "jx" if key != "": if "