# xiaojincai/docxconvert
							#-*-coding:utf-8 -*-
import re,os
import json,uuid
from bs4 import BeautifulSoup
from win32com.client import Dispatch
import pythoncom
from upload_to_oss import TedOSS
import threading 
import shutil
from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT

class DocxConverter(object):
    """Convert a Word document to HTML through COM automation.

    Word saves the document as a PDF, Acrobat re-saves that PDF as HTML,
    and any ``<uuid>.files`` directory found next to the document is then
    uploaded to OSS on a background thread.
    """

    def __init__(self,docpath="test4.docx"):
        """Remember the document path and create the OSS client.

        Args:
            docpath: Path of the .docx file to convert.  The file is deleted
                by :meth:`docx2html` once Word has been closed.
        """
        self.docpath = docpath
        self.oss = TedOSS()

    def upload_imgfiles(self,uuidhtml):
        """Upload every file of the ``<uuidhtml>.files`` directory to OSS.

        The directory is looked up next to ``self.docpath``; nothing happens
        when it does not exist.  Local files are left in place.

        This method performs no COM calls, so it deliberately does not touch
        ``pythoncom``: it usually runs on a worker thread that never called
        ``CoInitialize``, and COM initialisation must be balanced per thread.

        Args:
            uuidhtml: Base name (without extension) of the generated HTML.
        """
        imgdir = os.path.join(os.path.dirname(self.docpath), uuidhtml + ".files")
        if not os.path.exists(imgdir):
            return
        for filename in os.listdir(imgdir):
            local_path = os.path.join(imgdir, filename)
            # OSS object keys always use forward slashes.
            oss_key = uuidhtml + ".files/" + filename
            self.oss.upload_from_local(local_path, oss_key)

    def docx2html(self):
        """Convert ``self.docpath`` to HTML and return the HTML file path.

        Steps: open the document in Word, save it as PDF, convert the PDF to
        HTML with Acrobat, close Word, delete the source document and start
        the background upload of the image directory.

        Side effects: sets ``self.uuidhtml`` (random base name of the output
        files); ``self.word`` and ``self.doc`` are reset to ``None`` once
        Word has been shut down.

        Returns:
            Path of the generated ``<uuid>.html`` file, located in the same
            directory as the source document.

        Raises:
            Any COM error raised by Word.  Errors raised while Acrobat is
            converting the PDF are printed and swallowed, as before.
        """
        pythoncom.CoInitialize()
        self.word = None
        self.doc = None
        try:
            # NOTE: ``Dispatch`` must stay a module-level name.  Importing it
            # again inside this function would make it a local variable and
            # turn this first use into an UnboundLocalError.
            self.word = Dispatch("Word.Application")
            self.word.Visible = 0
            self.doc = self.word.Documents.Open(self.docpath)
            self.uuidhtml = str(uuid.uuid4())
            outdir = os.path.dirname(self.docpath)
            pdffile = os.path.join(outdir, self.uuidhtml + ".pdf")
            html = os.path.join(outdir, self.uuidhtml + ".html")
            self.doc.SaveAs(pdffile, 17)  # 17 == wdFormatPDF
            self._pdf2html(pdffile, html)
        finally:
            # Always shut Word down, then balance CoInitialize on this thread.
            try:
                self._release_word()
            finally:
                pythoncom.CoUninitialize()

        os.remove(self.docpath)

        task = threading.Thread(target=self.upload_imgfiles,args=(self.uuidhtml,))
        task.start()
        return html

    def _pdf2html(self, pdffile, html):
        """Re-save ``pdffile`` as HTML at ``html`` using Acrobat.

        Conversion errors are printed and swallowed (best effort); the
        Acrobat document is closed exactly once in every case.
        """
        import traceback

        av_doc = Dispatch("AcroExch.AVDoc")
        try:
            if av_doc.Open(pdffile, ""):
                pd_doc = av_doc.GetPDDoc()
                js_object = pd_doc.GetJSObject()
                js_object.SaveAs(html, "com.adobe.acrobat.html")
        except Exception:
            traceback.print_exc()
        finally:
            av_doc.Close(True)

    def _release_word(self):
        """Close the open document and quit Word, dropping both references."""
        try:
            if self.doc is not None:
                self.doc.Close()
        finally:
            try:
                if self.word is not None:
                    self.word.Quit()
            finally:
                # Release the COM proxies before COM is uninitialised.
                self.doc = None
                self.word = None

class QuestionsParser(object):
    """Parse exam questions out of an HTML document.

    Paragraphs are grouped into questions by the bracketed markers they
    contain.  Every question is a dict with these keys, each holding the
    concatenated HTML of the paragraphs collected for it:

    * ``tg``  - text following the stem marker
    * ``tx``  - text of the question-type heading
    * ``zsd`` - knowledge point
    * ``nd``  - difficulty
    * ``da``  - answer
    * ``jx``  - explanation
    """

    # Prefix put in front of image paths that point into a "<name>.files"
    # directory.
    IMG_HOST = "http://scxjcclub.oss-cn-beijing.aliyuncs.com/say365/"

    # One src attribute whose value points into a "<name>.files" directory.
    # Matching stops at the closing quote, so several images in one
    # paragraph are rewritten independently of each other.
    _IMG_SRC_RE = re.compile(r'src="([^"]*\.files[^"]*)"')

    # Field markers checked after the stem marker, in the original order;
    # when several occur in one paragraph the last one listed wins.
    _FIELD_MARKERS = (
        (u"【知识点】", "zsd"),
        (u"【难度】", "nd"),
        (u"【答案】", "da"),
        (u"【解析】", "jx"),
    )

    def __init__(self,name="test4.html"):
        """Read the HTML file ``name`` and parse it with BeautifulSoup."""
        # Context manager so the file handle is closed deterministically.
        with open(name,"r") as f:
            self.html = f.read()
        self.soup = BeautifulSoup(self.html,"html.parser")

    def get_paragraphs(self):
        """Return all ``<p>`` elements inside ``<div class="WordSection1">``.

        Raises:
            AttributeError: if the document has no such ``div``.
        """
        wordsection = self.soup.find("div",class_="WordSection1")
        pars = wordsection.find_all("p")
        return pars

    def parse_questions(self):
        """Extract the questions from the document.

        Returns:
            A list of question dicts (see the class docstring), in document
            order.  The list is empty when no stem marker is found.
        """
        return self._collect_questions(self.get_paragraphs())

    def _collect_questions(self, paragraphs):
        """Group paragraph objects (``.text`` plus ``str()``) into questions.

        Paragraphs that appear before the first stem marker are ignored.
        """
        data = []
        current = None   # question being filled, None until the first stem
        tx_name = ""     # text of the most recent question-type heading
        key = ""         # field that receives the following paragraphs
        for p in paragraphs:
            text = p.text
            if u"【题型】" in text:
                tx_name = text
            if u"【题干】" in text:
                key = "tg"
                # NOTE(review): the type is taken from the heading in effect
                # when the question is closed, as before - confirm this is
                # right for documents where a new heading directly precedes
                # the next stem.
                self._close_question(data, current, tx_name)
                current = {"tg":"","tx":"","zsd":"","nd":"","da":"","jx":""}
            for marker, field in self._FIELD_MARKERS:
                if marker in text:
                    key = field

            if key != "" and current is not None:
                content = str(p)
                if "<img" in content:
                    content = self._rewrite_img_src(content)
                current[key] += content

        # The last question is closed the same way as all the others, so it
        # also receives its type.
        self._close_question(data, current, tx_name)
        return data

    @staticmethod
    def _close_question(data, question, tx_name):
        """Stamp ``question`` with its type and append it if it has a stem."""
        if question is None:
            return
        question["tx"] = tx_name
        if question.get("tg"):
            data.append(question)

    @classmethod
    def _rewrite_img_src(cls, content):
        """Prefix every ``src="...files..."`` value in ``content`` with IMG_HOST.

        Content without such an attribute is returned unchanged.
        """
        return cls._IMG_SRC_RE.sub('src="' + cls.IMG_HOST + r'\1"', content)

    def get_questions(self):
        """Return the parsed questions with ``tx`` shortened and ``qno`` added.

        ``tx`` is reduced to the part after the last ``、`` and ``qno`` is
        filled in from the stem via :meth:`get_qno`.
        """
        questions = self.parse_questions()
        for que in questions:
            que["tx"] = que["tx"].split(u"、")[-1]
            que["qno"] = self.get_qno(que["tg"])
        return questions

    def get_qno(self,tg):
        """Extract the question number from the stem HTML ``tg``.

        The number is the first run of digits in the text before the first
        ``、`` separator.

        Raises:
            AttributeError: if that text contains no digit.
        """
        tgsoup = BeautifulSoup(tg,"html.parser")
        tgtext = tgsoup.text
        qno = re.search(r"\d+",tgtext.split(u"、")[0]).group()
        return qno
        
# Example objects, kept commented out: DocxConverter() constructs a TedOSS
# client and QuestionsParser() reads "test4.html" as soon as they are created.
#docxconverter = DocxConverter()
#questionparser = QuestionsParser()

if __name__ == "__main__":
    # Running this file as a script currently does nothing.  The commented
    # lines below show the intended manual workflow: dump the parsed
    # questions to t.json, or convert the default document to HTML.
    #ques = questionparser.get_questions()
    #with open("t.json","w+") as f:
    #    f.write(json.dumps(ques))
    #docxconverter.docx2html()
    pass