Fix: [MinerU] Missing output file (#11623)

### What problem does this PR solve?

Add fallbacks for the MinerU output path so that `_read_output` tries several candidate subdirectories before failing (#11613, #11620).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: Yongteng Lei (committed by GitHub)
Date: 2025-12-01 12:17:43 +08:00
Commit: 9d0309aedc (parent: 9a8ce9d3e2)
2 changed files with 48 additions and 23 deletions

File 1 of 2:

```diff
@@ -190,7 +190,7 @@ class MinerUParser(RAGFlowPdfParser):
         self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)

     def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
-        OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
+        output_zip_path = os.path.join(str(output_dir), "output.zip")
         pdf_file_path = str(input_path)
```
```diff
@@ -230,16 +230,16 @@ class MinerUParser(RAGFlowPdfParser):
         response.raise_for_status()

         if response.headers.get("Content-Type") == "application/zip":
-            self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+            self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
             if callback:
-                callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+                callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")

-            with open(OUTPUT_ZIP_PATH, "wb") as f:
+            with open(output_zip_path, "wb") as f:
                 f.write(response.content)

             self.logger.info(f"[MinerU] Unzip to {output_path}...")
-            self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
+            self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
             if callback:
                 callback(0.40, f"[MinerU] Unzip to {output_path}...")
```
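As a reviewer aid, a minimal standalone sketch of the save-then-extract flow above. The `requests` client and the `save_and_extract` helper name are assumptions (the hunk only shows the response object), and unlike the parser's `_extract_zip_no_root`, a plain `extractall` keeps the archive's root folder:

```python
import zipfile
from pathlib import Path

import requests  # assumed HTTP client; the hunk above only shows the response object


def save_and_extract(response: requests.Response, output_zip_path: str, output_path: Path) -> None:
    # Persist the zip body returned by the MinerU server...
    with open(output_zip_path, "wb") as f:
        f.write(response.content)
    # ...then unpack it. The parser's _extract_zip_no_root additionally
    # strips the archive's root folder; plain extractall does not.
    with zipfile.ZipFile(output_zip_path) as zf:
        zf.extractall(output_path)
```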
```diff
@@ -459,13 +459,36 @@ class MinerUParser(RAGFlowPdfParser):
         return poss

     def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
-        subdir = output_dir / file_stem / method
-        if backend.startswith("vlm-"):
-            subdir = output_dir / file_stem / "vlm"
-        json_file = subdir / f"{file_stem}_content_list.json"
+        candidates = []
+        seen = set()

-        if not json_file.exists():
-            raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
+        def add_candidate_path(p: Path):
+            if p not in seen:
+                seen.add(p)
+                candidates.append(p)
+
+        if backend.startswith("vlm-"):
+            add_candidate_path(output_dir / file_stem / "vlm")
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "auto")
+        else:
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "vlm")
+            add_candidate_path(output_dir / file_stem / "auto")
+
+        json_file = None
+        subdir = None
+        for sub in candidates:
+            jf = sub / f"{file_stem}_content_list.json"
+            if jf.exists():
+                subdir = sub
+                json_file = jf
+                break
+
+        if not json_file:
+            raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(c / (file_stem + '_content_list.json')) for c in candidates)}")

         with open(json_file, "r", encoding="utf-8") as f:
             data = json.load(f)
```
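In short, instead of probing a single hard-coded subdirectory, `_read_output` now walks an ordered candidate list and only raises once every path has failed. A standalone sketch of that strategy (the `find_content_list` helper name is hypothetical, not part of this PR):

```python
from pathlib import Path
from typing import Optional


def find_content_list(output_dir: Path, file_stem: str, method: str, backend: str) -> Optional[Path]:
    # Mirror the ordering above: vlm backends prefer the "vlm" subdir,
    # then the requested method, then the "auto" default; other backends
    # try the requested method first.
    names = ["vlm", method, "auto"] if backend.startswith("vlm-") else [method, "vlm", "auto"]
    tried = []
    for name in dict.fromkeys(n for n in names if n):  # dedupe, keep order
        candidate = output_dir / file_stem / name / f"{file_stem}_content_list.json"
        tried.append(candidate)
        if candidate.exists():
            return candidate
    return None  # caller raises FileNotFoundError listing every path in `tried`
```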
```diff
@@ -520,7 +543,7 @@ class MinerUParser(RAGFlowPdfParser):
         method: str = "auto",
         server_url: Optional[str] = None,
         delete_output: bool = True,
-        parse_method: str = "raw"
+        parse_method: str = "raw",
     ) -> tuple:
         import shutil
```
```diff
@@ -570,7 +593,7 @@ class MinerUParser(RAGFlowPdfParser):
             self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             if callback:
                 callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
         finally:
             if temp_pdf and temp_pdf.exists():
```
File 2 of 2:

```diff
@@ -33,9 +33,9 @@ from openai.lib.azure import AzureOpenAI
 from strenum import StrEnum
 from zhipuai import ZhipuAI

+from common.token_utils import num_tokens_from_string, total_token_count_from_response
 from rag.llm import FACTORY_DEFAULT_BASE_URL, LITELLM_PROVIDER_PREFIX, SupportedLiteLLMProvider
 from rag.nlp import is_chinese, is_english
-from common.token_utils import num_tokens_from_string, total_token_count_from_response

 # Error message constants
```
```diff
@@ -66,7 +66,7 @@ LENGTH_NOTIFICATION_EN = "...\nThe answer is truncated by your chosen LLM due to
 class Base(ABC):
     def __init__(self, key, model_name, base_url, **kwargs):
-        timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout)
         self.model_name = model_name
         # Configure retry parameters
```
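Note the environment variable rename here (and in `LiteLLMBase` below): the client timeout is now read from `LLM_TIMEOUT_SECONDS` rather than `LM_TIMEOUT_SECONDS`, so deployments exporting the old name will silently fall back to the 600-second default. A quick illustration of the lookup:

```python
import os

# The old LM_TIMEOUT_SECONDS name is no longer read after this change.
os.environ["LLM_TIMEOUT_SECONDS"] = "120"
timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
assert timeout == 120
```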
@@ -127,7 +127,7 @@ class Base(ABC):
"tool_choice",
"logprobs",
"top_logprobs",
"extra_headers"
"extra_headers",
}
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
```diff
@@ -1213,7 +1213,7 @@ class GoogleChat(Base):
         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
@@ -1242,14 +1242,14 @@ class GoogleChat(Base):
role = "model" if item["role"] == "assistant" else item["role"]
content = Content(
role=role,
parts=[Part(text=item["content"])]
parts=[Part(text=item["content"])],
)
contents.append(content)
response = self.client.models.generate_content(
model=self.model_name,
contents=contents,
config=config
config=config,
)
ans = response.text
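The role mapping above follows google-genai's convention that assistant turns are sent with role `"model"`. A minimal sketch of how a chat history is converted, assuming google-genai is installed (the `history` sample is illustrative only):

```python
from google.genai.types import Content, Part

history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
]

# OpenAI-style "assistant" turns become role="model" for the Gemini API.
contents = [
    Content(
        role="model" if m["role"] == "assistant" else m["role"],
        parts=[Part(text=m["content"])],
    )
    for m in history
]
```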
```diff
@@ -1299,7 +1299,7 @@ class GoogleChat(Base):
         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
```diff
@@ -1326,7 +1326,7 @@ class GoogleChat(Base):
             role = "model" if item["role"] == "assistant" else item["role"]
             content = Content(
                 role=role,
-                parts=[Part(text=item["content"])]
+                parts=[Part(text=item["content"])],
             )
             contents.append(content)
```
```diff
@@ -1334,7 +1334,7 @@ class GoogleChat(Base):
         for chunk in self.client.models.generate_content_stream(
             model=self.model_name,
             contents=contents,
-            config=config
+            config=config,
         ):
             text = chunk.text
             ans = text
```
```diff
@@ -1406,7 +1406,7 @@ class LiteLLMBase(ABC):
     ]

     def __init__(self, key, model_name, base_url=None, **kwargs):
-        self.timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        self.timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.provider = kwargs.get("provider", "")
         self.prefix = LITELLM_PROVIDER_PREFIX.get(self.provider, "")
         self.model_name = f"{self.prefix}{model_name}"
```
```diff
@@ -1625,6 +1625,7 @@ class LiteLLMBase(ABC):
         if self.provider == SupportedLiteLLMProvider.OpenRouter:
             if self.provider_order:
+
                 def _to_order_list(x):
                     if x is None:
                         return []
```

```diff
@@ -1633,6 +1634,7 @@ class LiteLLMBase(ABC):
                     if isinstance(x, (list, tuple)):
                         return [str(s).strip() for s in x if str(s).strip()]
                     return []
+
                 extra_body = {}
                 provider_cfg = {}
                 provider_order = _to_order_list(self.provider_order)
```
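For reference, a runnable copy of the normalization helper as shown. The hunks elide whatever sits between the `None` check and the `isinstance` check, so this copy reproduces only the visible branches:

```python
def _to_order_list(x):
    # Standalone copy of the helper above; the diff elides the branch(es)
    # between the None check and the isinstance check, so only the visible
    # behavior is reproduced here.
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        return [str(s).strip() for s in x if str(s).strip()]
    return []


assert _to_order_list(None) == []
assert _to_order_list(["openai", " anthropic ", ""]) == ["openai", "anthropic"]
assert _to_order_list(42) == []  # unrecognized types fall through to []
```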