#!/usr/bin/env python3
"""PDF to Markdown converter (intended to preserve LaTeX formulas)."""

import argparse
import os
import re
import subprocess
import sys
from pathlib import Path


def extract_latex_from_text(text: str) -> str:
    """Return *text* with LaTeX formulas restored.

    This is currently a placeholder hook: the text is returned unchanged.
    Formula-recovery rules tailored to a specific PDF can be added here.
    """
    return text


def _run_pdftotext(pdf_path: Path, txt_path: Path) -> subprocess.CompletedProcess:
    """Run ``pdftotext -layout`` to dump *pdf_path* as plain text into *txt_path*."""
    return subprocess.run(
        ["pdftotext", "-layout", str(pdf_path), str(txt_path)],
        capture_output=True,
    )


def pdf_to_markdown(pdf_path: str, output_path: str) -> bool:
    """Convert a PDF file to Markdown.

    The PDF is first extracted to an intermediate plain-text file located
    next to the output (same name, ``.txt`` suffix), which is left on disk.
    Its content is passed through :func:`extract_latex_from_text` and written
    to the output path with its suffix forced to ``.md``.

    Args:
        pdf_path: Path of the input PDF.
        output_path: Path of the desired output; the suffix is replaced by
            ``.md`` for the Markdown file and ``.txt`` for the intermediate.

    Returns:
        True if a tool succeeded and the Markdown file was written; False if
        the PDF does not exist or every extraction tool failed.
    """
    pdf_path = Path(pdf_path)
    output_path = Path(output_path)

    if not pdf_path.exists():
        print(f"❌ PDF 不存在:{pdf_path}")
        return False

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The extractor must write to the very file that is read back below.
    # Previously it wrote to ``output_path`` itself, so an ``.md`` output
    # path never produced the ``.txt`` file the reader was looking for.
    txt_path = output_path.with_suffix(".txt")
    md_path = output_path.with_suffix(".md")

    # Candidate tools in order of preference; ``None`` marks one that is
    # listed but not implemented yet (OCR needs extra processing).
    tools = [
        ("pdftotext", _run_pdftotext),
        ("pdftoppm + tesseract", None),
    ]

    for tool_name, func in tools:
        if func is None:
            continue

        print(f"🔧 尝试使用 {tool_name}...")
        try:
            result = func(pdf_path, txt_path)
        except OSError as exc:
            # Tool not installed or not executable: try the next one
            # instead of crashing with a traceback.
            print(f"⚠️ 无法运行 {tool_name}:{exc}")
            continue

        if result.returncode != 0:
            print(f"⚠️ {tool_name} 失败(退出码 {result.returncode})")
            continue

        if not txt_path.exists():
            continue

        # Undecodable bytes are dropped rather than aborting the conversion.
        content = txt_path.read_text(encoding="utf-8", errors="ignore")
        md_content = extract_latex_from_text(content)
        md_path.write_text(md_content, encoding="utf-8")

        print(f"✅ 转换完成:{md_path}")
        return True

    print("❌ 所有工具都失败")
    return False


def main() -> None:
    """Parse command-line arguments, run the conversion, and exit 0/1."""
    parser = argparse.ArgumentParser(description="PDF 转 Markdown")
    parser.add_argument("--input", required=True, help="输入 PDF 路径")
    parser.add_argument("--output", required=True, help="输出 Markdown 路径")

    args = parser.parse_args()

    success = pdf_to_markdown(args.input, args.output)
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()