# xiaojincai/docxconvert
							#!-*-coding:utf-8 -*-
import os
import sys
import time
import socket
import json
import logging
import inspect
import winerror
import win32event
import win32service
import servicemanager
import win32serviceutil

from flask import Flask,request

from tornado.ioloop import IOLoop
from tornado.wsgi import WSGIContainer
from tornado.httpserver import HTTPServer

#from parsedocx import DocxConverter,QuestionsParser

import re,os
import json,uuid
from bs4 import BeautifulSoup
from win32com.client import Dispatch,DispatchEx
import pythoncom


class DocxConverter(object):
    """Convert a Word document to HTML by driving Microsoft Word over COM.

    A fresh Word instance is started for every conversion and is always shut
    down again, even when opening or saving the document fails.
    """

    # ``FileFormat`` argument handed to ``Document.SaveAs``.
    # 10 is Word's ``wdFormatFilteredHTML`` constant.
    HTML_FORMAT = 10

    def __init__(self, docpath="test4.docx"):
        """Remember the path of the document to convert.

        :param docpath: path of the Word document; the HTML file is written
            into the same directory.
        """
        self.docpath = docpath

    def docx2html(self):
        """Save the document as HTML next to the source file.

        :returns: path of the generated ``<uuid4>.html`` file.
        :raises: whatever the COM layer raises when Word cannot be started or
            the document cannot be opened or saved.
        """
        self.word = DispatchEx("Word.Application")
        self.doc = None
        try:
            self.word.Visible = 0
            # Diagnostic traces only, so they are logged at INFO, not ERROR.
            xlog.info(self.docpath)
            xlog.info(self.word)
            self.doc = self.word.Documents.Open(self.docpath)
            xlog.info(self.doc)
            html = os.path.join(
                os.path.dirname(self.docpath), str(uuid.uuid4()) + ".html")
            self.doc.SaveAs(html, self.HTML_FORMAT)
            return html
        finally:
            # Always release Word; a failed conversion would otherwise leave
            # an invisible WINWORD process behind.
            try:
                if self.doc is not None:
                    # 0 == wdDoNotSaveChanges: never raise a "save changes?"
                    # prompt, which nobody could answer in a hidden instance.
                    self.doc.Close(0)
            finally:
                self.word.Quit()

class QuestionsParser(object):
    """Extract exam questions from the HTML that Word produced for a document.

    The document marks its paragraphs with bracketed tags: 【题型】 (question
    type header), 【题干】 (question stem), 【知识点】 (knowledge point),
    【难度】 (difficulty), 【答案】 (answer) and 【解析】 (explanation).
    """

    # (marker, question-dict key) pairs. When a paragraph contains several
    # markers the last matching pair wins.
    _FIELD_MARKERS = (
        (u"【题干】", "tg"),
        (u"【知识点】", "zsd"),
        (u"【难度】", "nd"),
        (u"【答案】", "da"),
        (u"【解析】", "jx"),
    )

    def __init__(self, name="test4.html"):
        """Read the HTML file ``name`` and parse it with BeautifulSoup."""
        # ``with`` closes the handle; it was previously left open.
        with open(name, "r") as fobj:
            self.html = fobj.read()
        self.soup = BeautifulSoup(self.html, "html.parser")

    def get_paragraphs(self):
        """Return every ``<p>`` element inside ``<div class="WordSection1">``."""
        wordsection = self.soup.find("div", class_="WordSection1")
        pars = wordsection.find_all("p")
        return pars

    def parse_questions(self):
        """Group the document's paragraphs into one dict per question.

        A paragraph containing 【题干】 starts a new question. That paragraph
        and every following one are appended, as serialized HTML, to the
        field selected by the most recent marker. Each question carries the
        text of the 【题型】 header that precedes it in ``tx``.

        :returns: list of dicts with the keys ``tg``, ``tx``, ``zsd``, ``nd``,
            ``da`` and ``jx``; empty when the document contains no stem.
        """
        questions = []
        current = None      # question being filled, None before the first stem
        tx_name = u""       # text of the most recent type header
        key = ""            # field that receives the current paragraph

        for p in self.get_paragraphs():
            text = p.text
            if u"【题型】" in text:
                tx_name = text
                # A type header belongs to no question: stop appending to the
                # previous question's last field.
                key = ""
            for marker, field in self._FIELD_MARKERS:
                if marker not in text:
                    continue
                key = field
                if field == "tg":
                    if current is not None:
                        questions.append(current)
                    # Attach the type to the question it precedes, not to the
                    # one that has just been closed.
                    current = {"tg": "", "tx": tx_name, "zsd": "",
                               "nd": "", "da": "", "jx": ""}
            # Paragraphs seen before the first stem have nowhere to go.
            if key and current is not None:
                current[key] += str(p)

        if current is not None:
            questions.append(current)
        return questions

    def get_questions(self):
        """Return the parsed questions with ``tx`` shortened and ``qno`` added.

        ``tx`` is reduced to the part after the last ``、`` of the type header
        and ``qno`` is the number extracted from the stem. The bracketed
        markers themselves are left in the HTML fields.
        """
        questions = self.parse_questions()
        for que in questions:
            que["tx"] = que["tx"].split(u"、")[-1]
            que["qno"] = self.get_qno(que["tg"])
        return questions

    def get_qno(self, tg):
        """Extract the question number from the stem HTML ``tg``.

        :returns: the first run of digits before the first ``、`` of the
            stem's text, or an empty string when there is none.
        """
        tgtext = BeautifulSoup(tg, "html.parser").text
        match = re.search(r"\d+", tgtext.split(u"、")[0])
        # An unnumbered stem used to abort the whole parse with AttributeError.
        return match.group() if match else u""


# Flask application exposing the conversion endpoint.
app = Flask(__name__)
# Working directory for uploaded documents, generated HTML and the log file.
root = "c:\\AppData\\say365"
# logging.FileHandler opens its file immediately and does not create missing
# directories, so make sure the directory exists before attaching the handler.
if not os.path.isdir(root):
    os.makedirs(root)
xlog = logging.getLogger('[PythonService]')
handler = logging.FileHandler(os.path.join(root, "service.log"))

formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)

xlog.addHandler(handler)
xlog.setLevel(logging.INFO)

@app.route('/parsedocx',methods=["POST"])
def parsedocx():
    """Convert an uploaded Word document and return its questions as JSON.

    Expects a multipart POST carrying the document in the ``file`` field. The
    upload is stored under ``root`` with a millisecond-timestamp name and its
    original extension, converted to HTML through Word and then parsed.

    :returns: JSON list of question dicts on success. On any failure the
        exception text is returned as the response body (still with Flask's
        default status code) and the traceback is written to the log.
    """
    try:
        fobj = request.files['file']
        if not os.path.exists(root):
            os.makedirs(root)
        # Only the extension of the client-supplied name is reused.
        suffix = os.path.splitext(fobj.filename)[-1]
        docxname = os.path.join(root, str(int(time.time() * 1000)) + suffix)
        with open(docxname, "wb") as doc:
            doc.write(fobj.read())

        html = DocxConverter(docxname).docx2html()
        questions = QuestionsParser(html).get_questions()
        res = json.dumps(questions)
        xlog.info("parsed %d questions from %s", len(questions), docxname)
        return res
    except Exception as e:
        # exception() records the traceback, which error(e) dropped.
        xlog.exception(e)
        return str(e)
    

def main():
    """Serve the Flask app on port 8002 through Tornado.

    Blocks inside the IOLoop until the loop is stopped.
    """
    # Tornado hosts the WSGI app here; the alternative that was tried is
    # Flask's own server: app.run(host='0.0.0.0', port=8002, debug=True)
    http_server = HTTPServer(WSGIContainer(app))
    http_server.listen(8002)
    loop = IOLoop.current()
    loop.start()
 
class XsacnService(win32serviceutil.ServiceFramework):
    """Windows service that serves the Flask app through Tornado on port 8002."""

    # Service name
    _svc_name_ = "XsacnService"
    # Name displayed for the service in Windows
    _svc_display_name_ = "XsacnService"
    # Service description
    _svc_description_ = "XsacnService"

    def __init__(self, args):
        """Create the stop event and the logger and mark the service alive."""
        win32serviceutil.ServiceFramework.__init__(self, args)
        # Auto-reset event signalled by SvcStop and waited on by SvcDoRun.
        self.stop_event = win32event.CreateEvent(None, 0, 0, None)
        socket.setdefaulttimeout(60)  # default timeout for new sockets
        self.logger = self._getLogger()
        self.isAlive = True
        # IOLoop that runs the HTTP server; set by main() so that SvcStop,
        # which runs on another thread, can ask it to stop.
        self.ioloop = None

    def _getLogger(self):
        """Return the '[PythonService]' logger writing to service.log beside this file."""
        logger = logging.getLogger('[PythonService]')

        this_file = inspect.getfile(inspect.currentframe())
        dirpath = os.path.abspath(os.path.dirname(this_file))
        handler = logging.FileHandler(os.path.join(dirpath, "service.log"))

        formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
        handler.setFormatter(formatter)

        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

        return logger

    def SvcDoRun(self):
        """Service body: keep the HTTP server running until a stop is requested.

        Every 20 seconds, or at once when the stop event is signalled, the
        loop probes 127.0.0.1:8002 and starts the server when nothing
        answers. COM is initialised and uninitialised on this thread because
        the request handlers that automate Word run inside the IOLoop
        started from here.
        """
        pythoncom.CoInitialize()
        try:
            while self.isAlive:
                self.logger.info("服务正在运行...")
                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                try:
                    # connect_ex returns 0 when something is listening,
                    # otherwise an error code.
                    result = sock.connect_ex(('127.0.0.1', 8002))
                finally:
                    sock.close()
                if result != 0:
                    # NOTE (from the original author): on Python 3.8+ the
                    # default asyncio event loop on Windows lacks add_reader
                    # and raises NotImplementedError; selecting
                    # asyncio.WindowsSelectorEventLoopPolicy() first would be
                    # needed there.
                    self.main()
                # Wakes up immediately when SvcStop signals the event, unlike
                # an unconditional sleep.
                win32event.WaitForSingleObject(self.stop_event, 20 * 1000)
        finally:
            # Balance CoInitialize on the same thread that called it.
            pythoncom.CoUninitialize()

    def main(self):
        """Run the Tornado HTTP server on port 8002 until the IOLoop is stopped."""
        # Publish the loop before checking isAlive: SvcStop clears isAlive
        # before reading self.ioloop, so one side always notices the other.
        self.ioloop = IOLoop.current()
        if not self.isAlive:
            return
        s = HTTPServer(WSGIContainer(app))
        s.listen(8002)
        try:
            self.ioloop.start()
        finally:
            # Release the port so a later restart can listen again.
            s.stop()

    def SvcStop(self):
        """Handle a stop request from the service control manager.

        Runs on the control-handler thread, so it only signals the worker:
        the flag and the event end the loop in SvcDoRun, and the IOLoop is
        stopped through add_callback, the IOLoop method that is safe to call
        from another thread. The framework reports SERVICE_STOPPED itself
        once SvcDoRun has returned.
        """
        self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING)
        self.isAlive = False
        win32event.SetEvent(self.stop_event)
        loop = self.ioloop
        if loop is not None:
            loop.add_callback(loop.stop)
        
 
if __name__ == '__main__':
    if len(sys.argv) == 1:
        # No arguments: try to connect to the service control dispatcher,
        # which is how the script runs when hosted as a service.
        try:
            evtsrc_dll = os.path.abspath(servicemanager.__file__)
            servicemanager.PrepareToHostSingle(XsacnService)
            servicemanager.Initialize('XsacnService', evtsrc_dll)
            servicemanager.StartServiceCtrlDispatcher()
        except win32service.error as details:
            # ``as`` works on Python 2.6+ and 3; args[0] is the Windows error code.
            if details.args[0] == winerror.ERROR_FAILED_SERVICE_CONTROLLER_CONNECT:
                # Could not connect to the service controller: show the
                # install/start/stop usage text instead.
                win32serviceutil.usage()
            else:
                # Any other service error used to vanish silently.
                raise
    else:
        # install / start / stop / remove ... are handled by pywin32.
        win32serviceutil.HandleCommandLine(XsacnService)
    # To serve without the service wrapper, call main() here instead.