pdf_to_md.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. #!/usr/bin/env python3
  2. """PDF 转 Markdown 工具(保留 LaTeX 公式)"""
  3. import os
  4. import re
  5. import subprocess
  6. from pathlib import Path
  7. def extract_latex_from_text(text: str) -> str:
  8. """尝试恢复 LaTeX 公式(简单处理)"""
  9. # 这里可以根据具体 PDF 内容优化
  10. return text
  11. def pdf_to_markdown(pdf_path: str, output_path: str) -> bool:
  12. """将 PDF 转换为 Markdown"""
  13. pdf_path = Path(pdf_path)
  14. output_path = Path(output_path)
  15. if not pdf_path.exists():
  16. print(f"❌ PDF 不存在:{pdf_path}")
  17. return False
  18. output_path.parent.mkdir(parents=True, exist_ok=True)
  19. # 尝试多种工具
  20. tools = [
  21. ("pdftotext", lambda p, o: subprocess.run(
  22. ["pdftotext", "-layout", str(p), str(o)],
  23. capture_output=True
  24. )),
  25. ("pdftoppm + tesseract", None), # 需要额外处理
  26. ]
  27. for tool_name, func in tools:
  28. if func is None:
  29. continue
  30. print(f"🔧 尝试使用 {tool_name}...")
  31. result = func(pdf_path, output_path)
  32. if result.returncode == 0:
  33. # 读取并处理
  34. txt_path = output_path.with_suffix(".txt")
  35. if txt_path.exists():
  36. with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
  37. content = f.read()
  38. # 转换为 Markdown
  39. md_content = extract_latex_from_text(content)
  40. md_path = output_path.with_suffix(".md")
  41. with open(md_path, "w", encoding="utf-8") as f:
  42. f.write(md_content)
  43. print(f"✅ 转换完成:{md_path}")
  44. return True
  45. print("❌ 所有工具都失败")
  46. return False
  47. def main():
  48. import argparse
  49. parser = argparse.ArgumentParser(description="PDF 转 Markdown")
  50. parser.add_argument("--input", required=True, help="输入 PDF 路径")
  51. parser.add_argument("--output", required=True, help="输出 Markdown 路径")
  52. args = parser.parse_args()
  53. success = pdf_to_markdown(args.input, args.output)
  54. sys.exit(0 if success else 1)
  55. if __name__ == "__main__":
  56. main()