| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- #!/usr/bin/env python3
- """PDF 转 Markdown 工具(保留 LaTeX 公式)"""
- import os
- import re
- import subprocess
- from pathlib import Path
def extract_latex_from_text(text: str) -> str:
    """Return *text* with LaTeX formulas restored (currently a pass-through).

    No recovery logic is implemented yet: the input is handed back
    unchanged. This is the hook to refine for the formulas that appear in
    a particular PDF.
    """
    restored = text
    return restored
def pdf_to_markdown(pdf_path: str, output_path: str) -> bool:
    """Convert a PDF file to Markdown using external text-extraction tools.

    Each available tool is tried in order. A tool writes plain text to an
    intermediate ``.txt`` file next to the requested output; that text is
    passed through ``extract_latex_from_text`` and saved with a ``.md``
    suffix.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Requested output path. The Markdown file is written to
            ``Path(output_path).with_suffix(".md")``; missing parent
            directories are created.

    Returns:
        True if a tool succeeded and the Markdown file was written; False
        if the PDF does not exist, the output path has no file name, or
        every tool failed or could not be launched.
    """
    src = Path(pdf_path)
    dest = Path(output_path)

    if not src.exists():
        print(f"❌ PDF 不存在:{src}")
        return False

    # with_suffix() raises ValueError on a path without a file name
    # (e.g. "" or "/"); report that as a failure instead of crashing.
    if not dest.name:
        print(f"❌ 输出路径无效:{dest}")
        return False

    dest.parent.mkdir(parents=True, exist_ok=True)

    # The extractor must write to the very file that is read back below;
    # handing it the raw output path left the .txt file missing whenever
    # the requested output did not already end in ".txt".
    txt_path = dest.with_suffix(".txt")
    md_path = dest.with_suffix(".md")

    # Candidate tools, tried in order. A None runner marks a tool that is
    # listed but not implemented yet.
    tools = [
        ("pdftotext", lambda p, o: subprocess.run(
            ["pdftotext", "-layout", str(p), str(o)],
            capture_output=True,
        )),
        ("pdftoppm + tesseract", None),  # needs extra handling (OCR)
    ]

    for tool_name, func in tools:
        if func is None:
            continue

        print(f"🔧 尝试使用 {tool_name}...")
        try:
            result = func(src, txt_path)
        except OSError as exc:
            # Raised when the executable is missing or cannot be started;
            # treat it like any other tool failure and try the next one.
            print(f"⚠️ 无法运行 {tool_name}:{exc}")
            continue

        if result.returncode != 0 or not txt_path.exists():
            continue

        # errors="ignore" drops undecodable bytes rather than aborting.
        with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()

        md_content = extract_latex_from_text(content)

        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_content)

        print(f"✅ 转换完成:{md_path}")
        return True

    print("❌ 所有工具都失败")
    return False
def main():
    """Command-line entry point: parse arguments, convert, then exit.

    Reads the required ``--input`` and ``--output`` options, runs
    ``pdf_to_markdown`` on them, and terminates the process with status 0
    on success or 1 on failure.
    """
    # Imported locally, like argparse: the module-level imports do not
    # include sys, so calling sys.exit() without this raises NameError.
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="PDF 转 Markdown")
    parser.add_argument("--input", required=True, help="输入 PDF 路径")
    parser.add_argument("--output", required=True, help="输出 Markdown 路径")
    args = parser.parse_args()

    success = pdf_to_markdown(args.input, args.output)
    sys.exit(0 if success else 1)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|