docx_processor.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. """
  2. Word文档处理模块
  3. 提供Word文档的处理和变量替换功能
  4. """
  5. import os
  6. import time
  7. from docx import Document
  8. from docx.oxml.ns import qn # 导入qn函数用于处理中文字体
  9. from docx.shared import Inches, Cm # 导入Inches和Cm用于设置图片尺寸
  10. from docx.enum.section import WD_SECTION_START # 导入WD_SECTION_START用于添加新页面
  11. from .logger import logger
  12. def check_variables_in_document(docx_path, variables):
  13. """
  14. 检查文档中是否包含指定的变量,并输出详细信息用于调试
  15. 参数:
  16. docx_path: Word文档路径
  17. variables: 要检查的变量字典 {变量名: 替换值}
  18. """
  19. logger.debug(f"检查文档变量: {docx_path}")
  20. doc = Document(docx_path)
  21. # 收集文档中的所有文本
  22. all_text = []
  23. # 检查段落中的变量(简化输出)
  24. for i, paragraph in enumerate(doc.paragraphs):
  25. all_text.append(paragraph.text)
  26. if paragraph.text.strip(): # 只记录非空段落
  27. logger.debug(f"段落 {i}: {paragraph.text}")
  28. # 检查表格中的变量(简化输出)
  29. for t_idx, table in enumerate(doc.tables):
  30. for r_idx, row in enumerate(table.rows):
  31. for c_idx, cell in enumerate(row.cells):
  32. for p_idx, paragraph in enumerate(cell.paragraphs):
  33. all_text.append(paragraph.text)
  34. if paragraph.text.strip(): # 只记录非空段落
  35. logger.debug(f"表格 {t_idx}, 行 {r_idx}, 列 {c_idx}, 段落 {p_idx}: {paragraph.text}")
  36. # 检查是否找到所有变量
  37. for var_name in variables.keys():
  38. found = False
  39. for text in all_text:
  40. if var_name in text:
  41. found = True
  42. logger.info(f"变量 '{var_name}' 在文档中找到!")
  43. break
  44. if not found:
  45. logger.warning(f"变量 '{var_name}' 在文档中未找到! 将被替换为空字符串。")
  46. # 检查文档中可能存在但未提供的变量
  47. potential_vars = set()
  48. for text in all_text:
  49. # 查找形如 {xxx} 的模式
  50. start = 0
  51. while True:
  52. start = text.find('{', start)
  53. if start == -1:
  54. break
  55. end = text.find('}', start)
  56. if end == -1:
  57. break
  58. potential_var = text[start:end+1]
  59. potential_vars.add(potential_var)
  60. start = end + 1
  61. # 检查是否有未提供的变量
  62. for var in potential_vars:
  63. if var not in variables:
  64. logger.warning(f"文档中存在变量 '{var}',但未提供替换值!")
  65. def verify_replacement(docx_path, variables):
  66. """
  67. 验证变量是否已被成功替换
  68. 参数:
  69. docx_path: 处理后的Word文档路径
  70. variables: 要验证的变量字典 {变量名: 替换值}
  71. """
  72. logger.debug(f"验证变量替换: {docx_path}")
  73. doc = Document(docx_path)
  74. # 收集文档中的所有文本
  75. all_text = []
  76. # 检查段落
  77. for paragraph in doc.paragraphs:
  78. all_text.append(paragraph.text)
  79. # 检查表格
  80. for table in doc.tables:
  81. for row in table.rows:
  82. for cell in row.cells:
  83. for paragraph in cell.paragraphs:
  84. all_text.append(paragraph.text)
  85. # 检查是否所有变量都已被替换
  86. replacement_failed = False
  87. for var_name in variables.keys():
  88. for text in all_text:
  89. if var_name in text:
  90. logger.warning(f"变量 '{var_name}' 在处理后的文档中仍然存在! 替换可能失败。")
  91. replacement_failed = True
  92. break
  93. if not replacement_failed:
  94. logger.info("所有变量都已成功替换")
  95. def replace_text_in_paragraph(paragraph, variables):
  96. """
  97. 在段落中替换变量,同时保留文本格式
  98. 此方法通过以下步骤工作:
  99. 1. 收集段落中的所有runs(文本片段)
  100. 2. 清空段落
  101. 3. 处理每个run中的变量
  102. 4. 创建新的run,保留原始格式
  103. 5. 将处理后的文本添加回段落
  104. 参数:
  105. paragraph: 要处理的段落对象
  106. variables: 变量替换字典 {变量名: 替换值}
  107. """
  108. # 检查段落是否包含任何变量
  109. contains_variable = False
  110. found_variables = []
  111. # 记录原始段落文本
  112. original_text = paragraph.text
  113. for var_name in variables.keys():
  114. if var_name in original_text:
  115. contains_variable = True
  116. found_variables.append(var_name)
  117. if not contains_variable:
  118. return
  119. logger.info(f"在段落中找到变量: {found_variables}")
  120. logger.info(f"原始段落文本: {original_text}")
  121. # 存储原始的运行对象
  122. runs = [run for run in paragraph.runs]
  123. paragraph.clear()
  124. # 对每个运行进行处理
  125. for i, run in enumerate(runs):
  126. text = run.text
  127. original_run_text = text
  128. # 替换所有变量
  129. for var_name, var_value in variables.items():
  130. if var_name in text:
  131. logger.info(f"在run {i}中替换变量 '{var_name}' 为 '{var_value}'")
  132. logger.info(f"替换前文本: '{text}'")
  133. text = text.replace(var_name, var_value)
  134. logger.info(f"替换后文本: '{text}'")
  135. # 如果文本没有变化,记录一下
  136. if original_run_text == text:
  137. logger.debug(f"Run {i} 文本未变化: '{text}'")
  138. # 创建新的运行,保留原始格式
  139. new_run = paragraph.add_run(text)
  140. # 复制格式
  141. new_run.bold = run.bold
  142. new_run.italic = run.italic
  143. new_run.underline = run.underline
  144. new_run.font.name = run.font.name
  145. new_run.font.size = run.font.size
  146. # 专门处理中文字体,确保东亚字体(如仿宋、宋体等)能够正确保留
  147. if hasattr(run._element, 'rPr') and run._element.rPr is not None:
  148. # 检查是否有rFonts元素
  149. rfonts = run._element.rPr.xpath('./w:rFonts')
  150. if rfonts and hasattr(rfonts[0], 'get'):
  151. # 获取东亚字体属性
  152. east_asia_font = rfonts[0].get(qn('w:eastAsia'))
  153. if east_asia_font:
  154. # 设置新run的东亚字体
  155. new_run._element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)
  156. logger.debug(f"设置东亚字体: {east_asia_font}")
  157. if run.font.color.rgb is not None:
  158. new_run.font.color.rgb = run.font.color.rgb
  159. # 记录处理后的段落文本
  160. logger.info(f"处理后段落文本: {paragraph.text}")
  161. def add_image_page(document, image_path):
  162. """
  163. 在文档末尾添加新页面并插入图片
  164. 参数:
  165. document: 文档对象
  166. image_path: 图片路径
  167. 返回:
  168. 成功返回True,失败返回False
  169. """
  170. try:
  171. logger.info(f"开始添加图片页面: {image_path}")
  172. if not os.path.exists(image_path):
  173. logger.error(f"图片文件不存在: {image_path}")
  174. return False
  175. # 添加分节符,创建新页
  176. current_section = document.sections[-1]
  177. new_section = document.add_section(WD_SECTION_START.NEW_PAGE)
  178. # 复制前一节的页面设置
  179. new_section.page_height = current_section.page_height
  180. new_section.page_width = current_section.page_width
  181. new_section.left_margin = current_section.left_margin
  182. new_section.right_margin = current_section.right_margin
  183. new_section.top_margin = current_section.top_margin
  184. new_section.bottom_margin = current_section.bottom_margin
  185. new_section.header_distance = current_section.header_distance
  186. new_section.footer_distance = current_section.footer_distance
  187. # 添加空段落
  188. document.add_paragraph()
  189. # 计算合适的图片尺寸
  190. width_inches = (new_section.page_width.inches -
  191. new_section.left_margin.inches -
  192. new_section.right_margin.inches) * 1
  193. # 添加图片到文档,设置宽度
  194. try:
  195. document.add_picture(image_path, width=Inches(width_inches))
  196. logger.info(f"成功添加图片: {os.path.basename(image_path)}")
  197. return True
  198. except Exception as e:
  199. logger.error(f"添加图片失败: {str(e)}")
  200. return False
  201. except Exception as e:
  202. logger.error(f"添加图片页面时出错: {str(e)}")
  203. return False
  204. def process_word_template(template_path, output_path, variables):
  205. """
  206. 处理Word文档,替换其中的模板变量
  207. 参数:
  208. template_path: 模板文档路径
  209. output_path: 输出文档路径
  210. variables: 变量替换字典 {变量名: 替换值}
  211. """
  212. # 记录处理开始
  213. start_time = time.time()
  214. logger.info(f"开始处理文档: {os.path.basename(template_path)}")
  215. logger.info(f"需要替换的变量: {list(variables.keys())}")
  216. # 只处理docx文件
  217. if not template_path.lower().endswith('.docx'):
  218. logger.error("只支持.docx格式的文件")
  219. raise ValueError("只支持.docx格式的文件")
  220. doc = Document(template_path)
  221. # 统计替换次数
  222. replacement_count = 0
  223. # 处理段落中的变量
  224. logger.info("开始处理文档段落...")
  225. paragraph_count = 0
  226. for i, paragraph in enumerate(doc.paragraphs):
  227. has_var = any(var_name in paragraph.text for var_name in variables.keys())
  228. if has_var:
  229. paragraph_count += 1
  230. replace_text_in_paragraph_improved(paragraph, variables)
  231. replacement_count += 1
  232. # 处理表格中的变量
  233. logger.info("开始处理文档表格...")
  234. table_cell_count = 0
  235. for t_idx, table in enumerate(doc.tables):
  236. for r_idx, row in enumerate(table.rows):
  237. for c_idx, cell in enumerate(row.cells):
  238. for p_idx, paragraph in enumerate(cell.paragraphs):
  239. has_var = any(var_name in paragraph.text for var_name in variables.keys())
  240. if has_var:
  241. table_cell_count += 1
  242. replace_text_in_paragraph_improved(paragraph, variables)
  243. replacement_count += 1
  244. logger.info(f"处理完成: 共处理了 {paragraph_count} 个段落和 {table_cell_count} 个表格单元格")
  245. # 检查是否需要添加图片
  246. image_path = os.path.join('image', 'test.jpeg')
  247. if os.path.exists(image_path):
  248. logger.info(f"发现图片文件: {image_path},准备添加图片页面")
  249. add_image_page(doc, image_path)
  250. # 保存生成的文档
  251. try:
  252. doc.save(output_path)
  253. logger.info(f"文档已保存: {os.path.basename(output_path)}")
  254. except PermissionError:
  255. # 如果文件被占用,尝试使用新的文件名
  256. dir_name = os.path.dirname(output_path)
  257. base_name = os.path.basename(output_path)
  258. new_output_path = os.path.join(dir_name, f"new_{base_name}")
  259. logger.warning(f"文件被占用,尝试保存到新位置: {os.path.basename(new_output_path)}")
  260. doc.save(new_output_path)
  261. # 重命名原始路径,以便后续代码能正确引用
  262. os.rename(new_output_path, output_path)
  263. logger.info(f"文件已重命名为: {os.path.basename(output_path)}")
  264. # 记录处理时间
  265. process_time = time.time() - start_time
  266. logger.info(f"文档处理完成,耗时: {process_time:.2f}秒")
  267. def replace_text_in_paragraph_improved(paragraph, variables):
  268. """
  269. 改进的段落变量替换方法,处理变量可能被分割在多个runs的情况
  270. 参数:
  271. paragraph: 要处理的段落对象
  272. variables: 变量替换字典 {变量名: 替换值}
  273. """
  274. # 记录原始段落文本
  275. original_text = paragraph.text
  276. # 检查段落是否包含任何变量
  277. found_variables = []
  278. for var_name in variables.keys():
  279. if var_name in original_text:
  280. found_variables.append(var_name)
  281. if not found_variables:
  282. return
  283. logger.info(f"发现需要替换的变量: {found_variables}")
  284. logger.info(f"原始文本: {original_text}")
  285. # 尝试使用原始的替换方法
  286. try:
  287. # 先尝试使用原始替换方法
  288. replace_text_in_paragraph(paragraph, variables)
  289. # 检查是否所有变量都被替换
  290. all_replaced = True
  291. for var_name in found_variables:
  292. if var_name in paragraph.text:
  293. all_replaced = False
  294. logger.warning(f"变量 '{var_name}' 未被替换,尝试使用备用方法")
  295. break
  296. if all_replaced:
  297. logger.info("所有变量已成功替换")
  298. return
  299. except Exception as e:
  300. logger.warning(f"原始替换方法失败: {str(e)},尝试使用备用方法")
  301. # 如果原始方法失败或未替换所有变量,使用备用方法
  302. logger.info("使用备用替换方法")
  303. # 记录原始格式信息
  304. original_runs = []
  305. for i, run in enumerate(paragraph.runs):
  306. original_runs.append({
  307. 'text': run.text,
  308. 'bold': run.bold,
  309. 'italic': run.italic,
  310. 'underline': run.underline,
  311. 'font_name': run.font.name,
  312. 'font_size': run.font.size,
  313. 'font_color': run.font.color.rgb,
  314. 'east_asia_font': None
  315. })
  316. # 获取东亚字体信息
  317. if hasattr(run._element, 'rPr') and run._element.rPr is not None:
  318. rfonts = run._element.rPr.xpath('./w:rFonts')
  319. if rfonts and hasattr(rfonts[0], 'get'):
  320. east_asia_font = rfonts[0].get(qn('w:eastAsia'))
  321. if east_asia_font:
  322. original_runs[-1]['east_asia_font'] = east_asia_font
  323. # 创建一个新的段落文本,替换所有变量
  324. new_text = original_text
  325. for var_name, var_value in variables.items():
  326. if var_name in new_text:
  327. logger.info(f"替换变量 '{var_name}' 为 '{var_value}'")
  328. new_text = new_text.replace(var_name, var_value)
  329. # 如果文本没有变化,不需要进一步处理
  330. if new_text == original_text:
  331. logger.info("文本未变化,跳过处理")
  332. return
  333. # 记录处理后的段落文本
  334. logger.info(f"替换后文本: {new_text}")
  335. # 清空段落
  336. paragraph.clear()
  337. # 使用最简单的方法:使用第一个run的格式,但不应用下划线
  338. # 这样可以确保文本不会全部带有下划线
  339. default_format = original_runs[0] if original_runs else None
  340. if default_format:
  341. new_run = paragraph.add_run(new_text)
  342. new_run.bold = default_format['bold']
  343. new_run.italic = default_format['italic']
  344. new_run.underline = False # 明确设置为不使用下划线
  345. new_run.font.name = default_format['font_name']
  346. if default_format['font_size'] is not None:
  347. new_run.font.size = default_format['font_size']
  348. if default_format['font_color'] is not None:
  349. new_run.font.color.rgb = default_format['font_color']
  350. # 设置东亚字体
  351. if default_format['east_asia_font']:
  352. new_run._element.rPr.rFonts.set(qn('w:eastAsia'), default_format['east_asia_font'])
  353. else:
  354. # 如果没有原始格式信息,直接添加文本
  355. paragraph.add_run(new_text)