# -*- coding: utf-8 -*-
- import re,os
- import json,uuid
- from bs4 import BeautifulSoup
- from win32com.client import Dispatch
- import pythoncom
- from upload_to_oss import TedOSS
- import threading
- import shutil
class DocxConverter(object):
    """Convert a Word document to HTML through Word COM automation.

    The HTML file is written next to the source document under a random UUID
    name.  Files found in the companion ``<uuid>.files`` directory are then
    uploaded through ``TedOSS`` by a background thread.
    """

    # Value passed to ``Document.SaveAs`` (WdSaveFormat.wdFormatFilteredHTML).
    HTML_SAVE_FORMAT = 10

    def __init__(self, docpath="test4.docx"):
        """Remember the document to convert and create the OSS client.

        :param docpath: path of the Word document; relative paths are
            resolved against the current working directory when used.
        """
        self.docpath = docpath
        self.oss = TedOSS()

    def upload_imgfiles(self, uuidhtml):
        """Upload every file in ``<uuidhtml>.files`` and delete the directory.

        Each file is uploaded under the key ``<uuidhtml>.files/<filename>``.
        Nothing happens when the directory does not exist.

        :param uuidhtml: base name (without extension) of the generated HTML.
        """
        # No COM call happens here: this method runs on a worker thread that
        # never initialised COM, so it must not call CoUninitialize either.
        imgroot = os.path.dirname(os.path.abspath(self.docpath))
        imgdir = os.path.join(imgroot, uuidhtml + ".files")
        if not os.path.isdir(imgdir):
            return
        for filename in os.listdir(imgdir):
            imgfile = os.path.join(imgdir, filename)
            ossfile = uuidhtml + ".files/" + filename
            self.oss.upload_from_local(imgfile, ossfile)
        # Only reached when every upload succeeded, so a failed batch keeps
        # its local files.
        shutil.rmtree(imgdir)

    def docx2html(self):
        """Save the document as HTML, delete the source and upload its images.

        :returns: absolute path of the generated ``<uuid>.html`` file.
        :raises: whatever the COM layer raises when Word cannot open or save
            the document; Word is still closed in that case and the source
            document is left in place.
        """
        # Word resolves relative names against its own working folder, not
        # the Python process's, so always hand it absolute paths.
        docpath = os.path.abspath(self.docpath)
        self.uuidhtml = str(uuid.uuid4())
        html = os.path.join(os.path.dirname(docpath), self.uuidhtml + ".html")

        pythoncom.CoInitialize()
        try:
            self.word = Dispatch("Word.Application")
            try:
                self.word.Visible = 0
                self.doc = self.word.Documents.Open(docpath)
                try:
                    self.doc.SaveAs(html, self.HTML_SAVE_FORMAT)
                finally:
                    self.doc.Close()
            finally:
                self.word.Quit()
        finally:
            # Drop the COM references before leaving the apartment, and
            # balance CoInitialize on the same thread that made the call.
            self.doc = None
            self.word = None
            pythoncom.CoUninitialize()

        os.remove(docpath)

        # Uploading is plain file/network I/O, so it is done off-thread.
        task = threading.Thread(target=self.upload_imgfiles, args=(self.uuidhtml,))
        task.start()
        return html
class QuestionsParser(object):
    """Extract exam questions from an HTML file exported by Word.

    Paragraphs are grouped into questions by the bracketed markers they
    contain.  Each question is a dict with the keys ``tg`` (【题干】),
    ``tx`` (【题型】), ``zsd`` (【知识点】), ``nd`` (【难度】), ``da`` (【答案】)
    and ``jx`` (【解析】); ``get_questions`` adds ``qno``.
    """

    # Prefix put in front of every exported image path.
    IMG_HOST = "http://scxjcclub.oss-cn-beijing.aliyuncs.com/say365/"

    # One ``src`` attribute pointing into a ``*.files`` export directory.
    # ``[^"]*`` keeps the match inside a single attribute value.
    _IMG_SRC_RE = re.compile(r'src="([^"]*\.files[^"]*)"')

    # Markers that switch the field receiving the following paragraphs.
    # Checked in this order; when several occur in one paragraph the last wins.
    _FIELD_MARKERS = (
        (u"【知识点】", "zsd"),
        (u"【难度】", "nd"),
        (u"【答案】", "da"),
        (u"【解析】", "jx"),
    )

    def __init__(self, name="test4.html"):
        """Load and parse the HTML file.

        :param name: path of the HTML file to parse.
        """
        # Read raw bytes so BeautifulSoup can detect the document's own
        # encoding, and close the handle deterministically.
        with open(name, "rb") as handle:
            self.html = handle.read()
        self.soup = BeautifulSoup(self.html, "html.parser")

    def get_paragraphs(self):
        """Return all ``<p>`` tags inside ``div.WordSection1``.

        :returns: list of paragraph tags; empty when the div is missing.
        """
        wordsection = self.soup.find("div", class_="WordSection1")
        if wordsection is None:
            return []
        return wordsection.find_all("p")

    def _new_question(self, tx_name):
        """Return an empty question record carrying the current type text."""
        return {"tg": "", "tx": tx_name, "zsd": "", "nd": "", "da": "", "jx": ""}

    def _rewrite_img_src(self, content):
        """Prefix every exported image ``src`` in *content* with ``IMG_HOST``."""
        if "<img" not in content:
            return content
        prefix = 'src="' + self.IMG_HOST
        return self._IMG_SRC_RE.sub(
            lambda match: prefix + match.group(1) + '"', content)

    def parse_questions(self):
        """Extract the questions.

        :returns: list of question dicts in document order.  Field values are
            the concatenated HTML of the paragraphs belonging to the field;
            ``tx`` is the text of the most recent 【题型】 paragraph.
        """
        data = []
        current = None
        tx_name = ""
        key = ""
        for p in self.get_paragraphs():
            text = p.text
            if u"【题型】" in text:
                tx_name = text
                # A section header closes the running field, otherwise it
                # would be appended to the previous question.
                key = ""
            if u"【题干】" in text:
                if current is not None and current.get("tg"):
                    data.append(current)
                # The type is stored on the question that starts here, not
                # on the one that just ended.
                current = self._new_question(tx_name)
                key = "tg"
            for marker, field in self._FIELD_MARKERS:
                if marker in text:
                    key = field
            # Content before the first question stem has no owner yet.
            if key == "" or current is None:
                continue
            current[key] += self._rewrite_img_src(str(p))
        if current is not None:
            data.append(current)
        return data

    def get_questions(self):
        """Return the parsed questions with ``tx`` trimmed and ``qno`` added.

        ``tx`` keeps only the part after the last ``、``; ``qno`` is the
        question number taken from the stem.
        """
        questions = self.parse_questions()
        for que in questions:
            que["tx"] = que["tx"].split(u"、")[-1]
            que["qno"] = self.get_qno(que["tg"])
        return questions

    def get_qno(self, tg):
        """Extract the question number.

        :param tg: HTML of the question stem.
        :returns: first run of digits before the first ``、`` in the stem
            text, or ``None`` when there is none.
        """
        tgtext = BeautifulSoup(tg, "html.parser").text
        match = re.search(r"\d+", tgtext.split(u"、")[0])
        return match.group() if match else None
-
# Example usage, left disabled: creating these at import time would construct
# the OSS client and read "test4.html" immediately.
#   docxconverter = DocxConverter()
#   questionparser = QuestionsParser()
if __name__ == "__main__":
    # Disabled sample run: parse the questions, dump them to t.json, then
    # convert the default document.
    #   ques = questionparser.get_questions()
    #   with open("t.json", "w+") as f:
    #       f.write(json.dumps(ques))
    #   docxconverter.docx2html()
    pass
|