parsedocx.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. #-*-coding:utf-8 -*-
  2. import re,os
  3. import json,uuid
  4. from bs4 import BeautifulSoup
  5. from win32com.client import Dispatch
  6. import pythoncom
  7. from upload_to_oss import TedOSS
  8. import threading
  9. import shutil
  10. from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT
  class DocxConverter(object):
      """Convert a Word document to HTML and upload its extracted images.

      Word is driven over COM to export the document as PDF, and Acrobat is
      then driven over COM to save that PDF as HTML. Both applications must
      be installed for ``docx2html`` to work.
      """

      # ``SaveAs`` format code used for the PDF export (Word's wdFormatPDF).
      WD_FORMAT_PDF = 17

      def __init__(self, docpath="test4.docx"):
          """Remember the document path and create the OSS client.

          Args:
              docpath: Path of the Word document to convert. Output files are
                  written next to it, and ``docx2html`` deletes it afterwards.
          """
          self.docpath = docpath
          self.oss = TedOSS()
          # COM handles; only populated while ``docx2html`` is running.
          self.word = None
          self.doc = None

      def upload_imgfiles(self, uuidhtml):
          """Upload every file in ``<uuidhtml>.files`` next to the document.

          Does nothing when that directory does not exist. The local
          directory is left in place after the upload.

          This method runs on a worker thread started by ``docx2html`` and
          makes no COM calls, so it must not touch COM initialisation: that
          state is per-thread and is balanced inside ``docx2html``.

          Args:
              uuidhtml: Base name (without extension) of the generated HTML.
          """
          imgdir = os.path.join(os.path.dirname(self.docpath), uuidhtml + ".files")
          if not os.path.isdir(imgdir):
              return
          for filename in os.listdir(imgdir):
              local_path = os.path.join(imgdir, filename)
              # Remote key mirrors the relative path referenced by the HTML.
              remote_path = uuidhtml + ".files/" + filename
              self.oss.upload_from_local(local_path, remote_path)

      def docx2html(self):
          """Convert ``self.docpath`` to HTML and return the HTML file path.

          Steps: export the document to ``<uuid>.pdf`` with Word, save that
          PDF as ``<uuid>.html`` with Acrobat, delete the source document,
          then start a background thread that uploads the image directory.

          A failure inside the Acrobat step is printed and swallowed (best
          effort), so the returned path may point to a file that was never
          written. Failures in the Word step propagate to the caller.

          Returns:
              Path of the HTML file, located next to the source document.
          """
          pythoncom.CoInitialize()
          self.word = None
          self.doc = None
          try:
              # NOTE: ``Dispatch`` is the module-level import. Re-importing it
              # inside this function would make the name local and raise
              # UnboundLocalError on this first use.
              self.word = Dispatch("Word.Application")
              self.word.Visible = 0
              self.doc = self.word.Documents.Open(self.docpath)
              self.uuidhtml = str(uuid.uuid4())
              outdir = os.path.dirname(self.docpath)
              pdffile = os.path.join(outdir, self.uuidhtml + ".pdf")
              html = os.path.join(outdir, self.uuidhtml + ".html")
              self.doc.SaveAs(pdffile, self.WD_FORMAT_PDF)
              self._pdf2html(pdffile, html)
          finally:
              # Always shut Word down and drop the COM references before
              # uninitialising COM on this (the initialising) thread.
              self._release_word()
              pythoncom.CoUninitialize()
          os.remove(self.docpath)
          task = threading.Thread(target=self.upload_imgfiles, args=(self.uuidhtml,))
          task.start()
          return html

      def _pdf2html(self, pdffile, html):
          """Save ``pdffile`` as HTML at ``html`` using Acrobat (best effort)."""
          import traceback

          av_doc = Dispatch("AcroExch.AVDoc")
          try:
              if av_doc.Open(pdffile, ""):
                  pd_doc = av_doc.GetPDDoc()
                  js_object = pd_doc.GetJSObject()
                  js_object.SaveAs(html, "com.adobe.acrobat.html")
          except Exception:
              # Conversion errors are reported but do not abort the caller.
              traceback.print_exc()
          finally:
              # Close exactly once, whether or not the conversion succeeded.
              av_doc.Close(True)

      def _release_word(self):
          """Close the open document, quit Word and drop both COM handles."""
          try:
              if self.doc is not None:
                  self.doc.Close()
          finally:
              try:
                  if self.word is not None:
                      self.word.Quit()
              finally:
                  self.doc = None
                  self.word = None
  class QuestionsParser(object):
      """Parse exam questions out of an HTML export of a Word document.

      Paragraphs are scanned in order. Marker strings inside a paragraph
      decide which field of the current question the paragraph belongs to:

          tx  - question type   (marker 【题型】, a section header)
          tg  - question stem   (marker 【题干】, starts a new question)
          zsd - knowledge point (marker 【知识点】)
          nd  - difficulty      (marker 【难度】)
          da  - answer          (marker 【答案】)
          jx  - explanation     (marker 【解析】)

      Field values are the concatenated HTML of the paragraphs, marker text
      included.
      """

      # Prefix put in front of relative ``*.files`` image paths.
      IMG_HOST = "http://scxjcclub.oss-cn-beijing.aliyuncs.com/say365/"

      # Field markers in the order they are checked; when several occur in
      # one paragraph the last one wins.
      FIELD_MARKERS = (
          (u"【知识点】", "zsd"),
          (u"【难度】", "nd"),
          (u"【答案】", "da"),
          (u"【解析】", "jx"),
      )

      # One ``src`` attribute that points into a ``*.files`` directory.
      # ``[^"]*`` keeps each match inside a single attribute value.
      _IMG_SRC_RE = re.compile(r'src="([^"]*\.files[^"]*)"')

      def __init__(self, name="test4.html"):
          """Read the HTML file ``name`` and parse it with BeautifulSoup."""
          with open(name, "r") as f:
              self.html = f.read()
          self.soup = BeautifulSoup(self.html, "html.parser")

      def get_paragraphs(self):
          """Return all ``<p>`` tags of the document body.

          Looks inside ``<div class="WordSection1">`` when present, otherwise
          falls back to the whole document instead of failing.
          """
          section = self.soup.find("div", class_="WordSection1")
          if section is None:
              section = self.soup
          return section.find_all("p")

      def _rewrite_img_src(self, content):
          """Prefix every ``*.files`` image path in ``content`` with IMG_HOST."""
          if "<img" not in content:
              return content
          host = self.IMG_HOST
          return self._IMG_SRC_RE.sub(
              lambda m: 'src="' + host + m.group(1) + '"', content
          )

      def parse_questions(self):
          """Extract the questions as a list of dicts.

          Returns:
              One dict per question with the keys ``tg``, ``tx``, ``zsd``,
              ``nd``, ``da`` and ``jx``. ``tx`` is the full text of the most
              recent type header seen before the question. An empty list is
              returned when no stem marker is found.
          """
          data = []
          current = None  # question being filled; None until the first stem
          tx_name = ""
          key = ""
          for p in self.get_paragraphs():
              text = p.text
              if u"【题型】" in text:
                  tx_name = text
                  # A type header belongs to no question field.
                  key = ""
              if u"【题干】" in text:
                  if current is not None and current.get("tg"):
                      data.append(current)
                  # Stamp the type at creation so every question, including
                  # the last one, carries the header that precedes it.
                  current = {"tg": "", "tx": tx_name, "zsd": "",
                             "nd": "", "da": "", "jx": ""}
                  key = "tg"
              for marker, field in self.FIELD_MARKERS:
                  if marker in text:
                      key = field
              if key and current is not None:
                  current[key] += self._rewrite_img_src(str(p))
          if current is not None:
              data.append(current)
          return data

      def get_questions(self):
          """Return the parsed questions with ``tx`` trimmed and ``qno`` added.

          ``tx`` is reduced to the part after the last ``、`` and ``qno`` is
          the question number taken from the stem (see ``get_qno``).
          """
          questions = self.parse_questions()
          for que in questions:
              que["tx"] = que["tx"].split(u"、")[-1]
              que["qno"] = self.get_qno(que["tg"])
          return questions

      def get_qno(self, tg):
          """Extract the question number from the stem HTML ``tg``.

          Returns:
              The first run of digits before the first ``、`` in the stem's
              text, as a string, or ``None`` when there is none.
          """
          tgtext = BeautifulSoup(tg, "html.parser").text
          match = re.search(r"\d+", tgtext.split(u"、")[0])
          return match.group() if match else None
  # Example module-level instances, left disabled so that importing this
  # module has no side effects:
  #docxconverter = DocxConverter()
  #questionparser = QuestionsParser()
  if __name__ == "__main__":
      # Running the module as a script currently does nothing. The disabled
      # lines below sketch the intended use: dump the parsed questions to
      # ``t.json`` and/or convert the default document to HTML.
      #ques = questionparser.get_questions()
      #with open("t.json","w+") as f:
      #    f.write(json.dumps(ques))
      #docxconverter.docx2html()
      pass