Fix: [MinerU] Missing output file (#11623)

### What problem does this PR solve?

Add fallbacks for the MinerU output path so that `_read_output` tries several candidate subdirectories before failing (#11613, #11620).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: Yongteng Lei (committed by GitHub)
Date: 2025-12-01 12:17:43 +08:00
Commit: 9d0309aedc (parent: 9a8ce9d3e2)
2 changed files with 48 additions and 23 deletions

File 1 of 2:

```diff
@@ -190,7 +190,7 @@ class MinerUParser(RAGFlowPdfParser):
         self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)

     def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
-        OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
+        output_zip_path = os.path.join(str(output_dir), "output.zip")
         pdf_file_path = str(input_path)
```
```diff
@@ -230,16 +230,16 @@ class MinerUParser(RAGFlowPdfParser):
         response.raise_for_status()

         if response.headers.get("Content-Type") == "application/zip":
-            self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+            self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
             if callback:
-                callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+                callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")

-            with open(OUTPUT_ZIP_PATH, "wb") as f:
+            with open(output_zip_path, "wb") as f:
                 f.write(response.content)

             self.logger.info(f"[MinerU] Unzip to {output_path}...")
-            self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
+            self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
             if callback:
                 callback(0.40, f"[MinerU] Unzip to {output_path}...")
```
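As a reviewer aid, a minimal standalone sketch of the save-then-extract flow above. The `requests` client and the `save_and_extract` helper name are assumptions (the hunk only shows the response object), and unlike the parser's `_extract_zip_no_root`, a plain `extractall` keeps the archive's root folder:

```python
import zipfile
from pathlib import Path

import requests  # assumed HTTP client; the hunk above only shows the response object


def save_and_extract(response: requests.Response, output_zip_path: str, output_path: Path) -> None:
    # Persist the zip body returned by the MinerU server...
    with open(output_zip_path, "wb") as f:
        f.write(response.content)
    # ...then unpack it. The parser's _extract_zip_no_root additionally
    # strips the archive's root folder; plain extractall does not.
    with zipfile.ZipFile(output_zip_path) as zf:
        zf.extractall(output_path)
```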
```diff
@@ -459,13 +459,36 @@ class MinerUParser(RAGFlowPdfParser):
         return poss

     def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
-        subdir = output_dir / file_stem / method
-        if backend.startswith("vlm-"):
-            subdir = output_dir / file_stem / "vlm"
-        json_file = subdir / f"{file_stem}_content_list.json"
+        candidates = []
+        seen = set()

-        if not json_file.exists():
-            raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
+        def add_candidate_path(p: Path):
+            if p not in seen:
+                seen.add(p)
+                candidates.append(p)
+
+        if backend.startswith("vlm-"):
+            add_candidate_path(output_dir / file_stem / "vlm")
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "auto")
+        else:
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "vlm")
+            add_candidate_path(output_dir / file_stem / "auto")
+
+        json_file = None
+        subdir = None
+        for sub in candidates:
+            jf = sub / f"{file_stem}_content_list.json"
+            if jf.exists():
+                subdir = sub
+                json_file = jf
+                break
+
+        if not json_file:
+            raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(c / (file_stem + '_content_list.json')) for c in candidates)}")

         with open(json_file, "r", encoding="utf-8") as f:
             data = json.load(f)
```
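In short, instead of probing a single hard-coded subdirectory, `_read_output` now walks an ordered candidate list and only raises once every path has failed. A standalone sketch of that strategy (the `find_content_list` helper name is hypothetical, not part of this PR):

```python
from pathlib import Path
from typing import Optional


def find_content_list(output_dir: Path, file_stem: str, method: str, backend: str) -> Optional[Path]:
    # Mirror the ordering above: vlm backends prefer the "vlm" subdir,
    # then the requested method, then the "auto" default; other backends
    # try the requested method first.
    names = ["vlm", method, "auto"] if backend.startswith("vlm-") else [method, "vlm", "auto"]
    tried = []
    for name in dict.fromkeys(n for n in names if n):  # dedupe, keep order
        candidate = output_dir / file_stem / name / f"{file_stem}_content_list.json"
        tried.append(candidate)
        if candidate.exists():
            return candidate
    return None  # caller raises FileNotFoundError listing every path in `tried`
```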
```diff
@@ -520,7 +543,7 @@ class MinerUParser(RAGFlowPdfParser):
         method: str = "auto",
         server_url: Optional[str] = None,
         delete_output: bool = True,
-        parse_method: str = "raw"
+        parse_method: str = "raw",
     ) -> tuple:
         import shutil
```
```diff
@@ -570,7 +593,7 @@ class MinerUParser(RAGFlowPdfParser):
             self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             if callback:
                 callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
         finally:
             if temp_pdf and temp_pdf.exists():
```
File 2 of 2:

```diff
@@ -33,9 +33,9 @@ from openai.lib.azure import AzureOpenAI
 from strenum import StrEnum
 from zhipuai import ZhipuAI

+from common.token_utils import num_tokens_from_string, total_token_count_from_response
 from rag.llm import FACTORY_DEFAULT_BASE_URL, LITELLM_PROVIDER_PREFIX, SupportedLiteLLMProvider
 from rag.nlp import is_chinese, is_english
-from common.token_utils import num_tokens_from_string, total_token_count_from_response

 # Error message constants
```
```diff
@@ -66,7 +66,7 @@ LENGTH_NOTIFICATION_EN = "...\nThe answer is truncated by your chosen LLM due to
 class Base(ABC):
     def __init__(self, key, model_name, base_url, **kwargs):
-        timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout)
         self.model_name = model_name
         # Configure retry parameters
```
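Note the environment variable rename here (and in `LiteLLMBase` below): the client timeout is now read from `LLM_TIMEOUT_SECONDS` rather than `LM_TIMEOUT_SECONDS`, so deployments exporting the old name will silently fall back to the 600-second default. A quick illustration of the lookup:

```python
import os

# The old LM_TIMEOUT_SECONDS name is no longer read after this change.
os.environ["LLM_TIMEOUT_SECONDS"] = "120"
timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
assert timeout == 120
```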
@@ -127,7 +127,7 @@ class Base(ABC):
"tool_choice",
"logprobs",
"top_logprobs",
"extra_headers"
"extra_headers",
}
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
```diff
@@ -1213,7 +1213,7 @@ class GoogleChat(Base):
         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
@@ -1242,14 +1242,14 @@ class GoogleChat(Base):
role = "model" if item["role"] == "assistant" else item["role"]
content = Content(
role=role,
parts=[Part(text=item["content"])]
parts=[Part(text=item["content"])],
)
contents.append(content)
response = self.client.models.generate_content(
model=self.model_name,
contents=contents,
config=config
config=config,
)
ans = response.text
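The role mapping above follows google-genai's convention that assistant turns are sent with role `"model"`. A minimal sketch of how a chat history is converted, assuming google-genai is installed (the `history` sample is illustrative only):

```python
from google.genai.types import Content, Part

history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
]

# OpenAI-style "assistant" turns become role="model" for the Gemini API.
contents = [
    Content(
        role="model" if m["role"] == "assistant" else m["role"],
        parts=[Part(text=m["content"])],
    )
    for m in history
]
```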
```diff
@@ -1299,7 +1299,7 @@ class GoogleChat(Base):
         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
```diff
@@ -1326,7 +1326,7 @@ class GoogleChat(Base):
             role = "model" if item["role"] == "assistant" else item["role"]
             content = Content(
                 role=role,
-                parts=[Part(text=item["content"])]
+                parts=[Part(text=item["content"])],
             )
             contents.append(content)
```
```diff
@@ -1334,7 +1334,7 @@ class GoogleChat(Base):
         for chunk in self.client.models.generate_content_stream(
             model=self.model_name,
             contents=contents,
-            config=config
+            config=config,
         ):
             text = chunk.text
             ans = text
```
```diff
@@ -1406,7 +1406,7 @@ class LiteLLMBase(ABC):
     ]

     def __init__(self, key, model_name, base_url=None, **kwargs):
-        self.timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        self.timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.provider = kwargs.get("provider", "")
         self.prefix = LITELLM_PROVIDER_PREFIX.get(self.provider, "")
         self.model_name = f"{self.prefix}{model_name}"
```
```diff
@@ -1625,6 +1625,7 @@ class LiteLLMBase(ABC):
         if self.provider == SupportedLiteLLMProvider.OpenRouter:
             if self.provider_order:
+
                 def _to_order_list(x):
                     if x is None:
                         return []
```

```diff
@@ -1633,6 +1634,7 @@ class LiteLLMBase(ABC):
                     if isinstance(x, (list, tuple)):
                         return [str(s).strip() for s in x if str(s).strip()]
                     return []
+
                 extra_body = {}
                 provider_cfg = {}
                 provider_order = _to_order_list(self.provider_order)
```
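For reference, a runnable copy of the normalization helper as shown. The hunks elide whatever sits between the `None` check and the `isinstance` check, so this copy reproduces only the visible branches:

```python
def _to_order_list(x):
    # Standalone copy of the helper above; the diff elides the branch(es)
    # between the None check and the isinstance check, so only the visible
    # behavior is reproduced here.
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        return [str(s).strip() for s in x if str(s).strip()]
    return []


assert _to_order_list(None) == []
assert _to_order_list(["openai", " anthropic ", ""]) == ["openai", "anthropic"]
assert _to_order_list(42) == []  # unrecognized types fall through to []
```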