parsedocx.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. #-*-coding:utf-8 -*-
  2. import re,os
  3. import json,uuid
  4. from bs4 import BeautifulSoup
  5. from win32com.client import Dispatch
  6. import pythoncom
  7. class DocxConverter(object):
  8. """
  9. """
  10. def __init__(self,docpath="test4.docx"):
  11. """
  12. """
  13. self.docpath = docpath
  14. def docx2html(self):
  15. """
  16. """
  17. pythoncom.CoInitialize()
  18. self.word = Dispatch("Word.Application")
  19. self.word.Visible = 0
  20. self.doc = self.word.Documents.Open(self.docpath)
  21. html = os.path.join(os.path.dirname(self.docpath),str(uuid.uuid4())+".html")
  22. self.doc.SaveAs(html,10)
  23. self.doc.Close()
  24. self.word.Quit()
  25. pythoncom.CoUninitialize()
  26. return html
  27. class QuestionsParser(object):
  28. """试题解析
  29. """
  30. def __init__(self,name="test4.html"):
  31. self.html = open(name,"r").read()
  32. self.soup = BeautifulSoup(self.html,"html.parser")
  33. def get_paragraphs(self):
  34. """
  35. """
  36. wordsection = self.soup.find("div",class_="WordSection1")
  37. #print wordsection
  38. pars = wordsection.find_all("p")
  39. return pars
  40. def parse_questions(self):
  41. """提取试题
  42. """
  43. que_type_dct = {}
  44. paragraphs = self.get_paragraphs()
  45. for i,p in enumerate(paragraphs):
  46. print p.text
  47. if u"【题型】" in p.text:
  48. que_type_dct["type"] = p.text.split("、")[-1]
  49. def parse_questions(self):
  50. """提取试题
  51. """
  52. data = []
  53. tmp_val = {}
  54. tx_name = ""
  55. key = ""
  56. paragraphs = self.get_paragraphs()
  57. for i,p in enumerate(paragraphs):
  58. if u"【题型】" in p.text:
  59. tx_name = p.text
  60. if u"【题干】" in p.text:
  61. key = "tg"
  62. tmp_val["tx"] = tx_name
  63. if tmp_val.get("tg"):
  64. data.append(tmp_val)
  65. tmp_val = {"tg":"","tx":"","zsd":"","nd":"","da":"","jx":""}
  66. if u"【知识点】" in p.text:
  67. key = "zsd"
  68. if u"【难度】" in p.text:
  69. key = "nd"
  70. if u"【答案】" in p.text:
  71. key = "da"
  72. if u"【解析】" in p.text:
  73. key = "jx"
  74. if key != "":
  75. tmp_val[key] += p.__str__()
  76. data.append(tmp_val)
  77. return data
  78. def get_questions(self):
  79. """
  80. """
  81. questions = self.parse_questions()
  82. for que in questions:
  83. que["tx"] = que["tx"].split(u"、")[-1]
  84. #que["tg"] = que["tg"].replace(u"【题干】","")
  85. #que["zsd"] = que["zsd"].replace(u"【知识点】","")
  86. #que["da"] = que["da"].replace(u"【答案】","")
  87. #que["jx"] = que["jx"].replace(u"【解析】","")
  88. que["qno"] = self.get_qno(que["tg"])
  89. return questions
  90. def get_qno(self,tg):
  91. """提取题号
  92. """
  93. tgsoup = BeautifulSoup(tg,"html.parser")
  94. tgtext = tgsoup.text
  95. qno = re.search(r"\d+",tgtext.split(u"、")[0]).group()
  96. return qno
  97. #docxconverter = DocxConverter()
  98. #questionparser = QuestionsParser()
  99. if __name__ == "__main__":
  100. #ques = questionparser.get_questions()
  101. #with open("t.json","w+") as f:
  102. # f.write(json.dumps(ques))
  103. #docxconverter.docx2html()
  104. pass