# Fix: [MinerU] Missing output file (#11623)
### What problem does this PR solve?

Add fallbacks for the MinerU output path. #11613, #11620.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
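The core of the fix is in `_read_output`: instead of deriving a single hard-coded `<output_dir>/<stem>/<method>` directory and raising as soon as `<stem>_content_list.json` is missing there, the parser now probes an ordered, de-duplicated list of candidate directories. A minimal sketch of that resolution order, condensed from the diff below (the free-standing function name `resolve_content_list` is illustrative; in the parser this logic lives inside `MinerUParser._read_output`):

```python
from pathlib import Path


def resolve_content_list(output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> Path:
    candidates: list[Path] = []

    def add(p: Path) -> None:
        if p not in candidates:  # keep first occurrence, drop duplicates
            candidates.append(p)

    # vlm-* backends write under <stem>/vlm, so probe that first; otherwise
    # prefer the requested method, with the other layouts as fallbacks.
    order = ["vlm", method, "auto"] if backend.startswith("vlm-") else [method, "vlm", "auto"]
    for name in order:
        if name:
            add(output_dir / file_stem / name)

    for sub in candidates:
        jf = sub / f"{file_stem}_content_list.json"
        if jf.exists():
            return jf
    raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(c / (file_stem + '_content_list.json')) for c in candidates)}")
```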
```diff
@@ -190,7 +190,7 @@ class MinerUParser(RAGFlowPdfParser):
         self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)

     def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
-        OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
+        output_zip_path = os.path.join(str(output_dir), "output.zip")

        pdf_file_path = str(input_path)

```
```diff
@@ -230,16 +230,16 @@ class MinerUParser(RAGFlowPdfParser):

             response.raise_for_status()
             if response.headers.get("Content-Type") == "application/zip":
-                self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+                self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")

                 if callback:
-                    callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+                    callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")

-                with open(OUTPUT_ZIP_PATH, "wb") as f:
+                with open(output_zip_path, "wb") as f:
                     f.write(response.content)

                 self.logger.info(f"[MinerU] Unzip to {output_path}...")
-                self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
+                self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")

                 if callback:
                     callback(0.40, f"[MinerU] Unzip to {output_path}...")
```
```diff
@@ -459,13 +459,36 @@ class MinerUParser(RAGFlowPdfParser):
         return poss

     def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
-        subdir = output_dir / file_stem / method
-        if backend.startswith("vlm-"):
-            subdir = output_dir / file_stem / "vlm"
-        json_file = subdir / f"{file_stem}_content_list.json"
+        candidates = []
+        seen = set()

-        if not json_file.exists():
-            raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
+        def add_candidate_path(p: Path):
+            if p not in seen:
+                seen.add(p)
+                candidates.append(p)
+
+        if backend.startswith("vlm-"):
+            add_candidate_path(output_dir / file_stem / "vlm")
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "auto")
+        else:
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "vlm")
+            add_candidate_path(output_dir / file_stem / "auto")
+
+        json_file = None
+        subdir = None
+        for sub in candidates:
+            jf = sub / f"{file_stem}_content_list.json"
+            if jf.exists():
+                subdir = sub
+                json_file = jf
+                break
+
+        if not json_file:
+            raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(c / (file_stem + '_content_list.json')) for c in candidates)}")

         with open(json_file, "r", encoding="utf-8") as f:
             data = json.load(f)
```
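For example, with `backend="vlm-transformers"` and the default `method="auto"`, the candidate list collapses to just two directories. A standalone check of that ordering (the paths here are made up):

```python
from pathlib import Path

out, stem = Path("/tmp/mineru_out"), "report"  # hypothetical output tree
method, backend = "auto", "vlm-transformers"

order = ["vlm", method, "auto"] if backend.startswith("vlm-") else [method, "vlm", "auto"]
candidates: list[Path] = []
for name in order:
    p = out / stem / name
    if name and p not in candidates:
        candidates.append(p)

print([str(c) for c in candidates])
# ['/tmp/mineru_out/report/vlm', '/tmp/mineru_out/report/auto']
```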
```diff
@@ -520,7 +543,7 @@ class MinerUParser(RAGFlowPdfParser):
         method: str = "auto",
         server_url: Optional[str] = None,
         delete_output: bool = True,
-        parse_method: str = "raw"
+        parse_method: str = "raw",
     ) -> tuple:
         import shutil

```
```diff
@@ -570,7 +593,7 @@ class MinerUParser(RAGFlowPdfParser):
             self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             if callback:
                 callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
-
+
             return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
         finally:
             if temp_pdf and temp_pdf.exists():
```
```diff
@@ -33,9 +33,9 @@ from openai.lib.azure import AzureOpenAI
 from strenum import StrEnum
 from zhipuai import ZhipuAI

+from common.token_utils import num_tokens_from_string, total_token_count_from_response
 from rag.llm import FACTORY_DEFAULT_BASE_URL, LITELLM_PROVIDER_PREFIX, SupportedLiteLLMProvider
 from rag.nlp import is_chinese, is_english
-from common.token_utils import num_tokens_from_string, total_token_count_from_response


 # Error message constants
```
```diff
@@ -66,7 +66,7 @@ LENGTH_NOTIFICATION_EN = "...\nThe answer is truncated by your chosen LLM due to

 class Base(ABC):
     def __init__(self, key, model_name, base_url, **kwargs):
-        timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout)
         self.model_name = model_name
         # Configure retry parameters
```
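Note that the variable read here is renamed from `LM_TIMEOUT_SECONDS` to `LLM_TIMEOUT_SECONDS` (here and in `LiteLLMBase` below), so deployments that exported the old name will silently fall back to the 600-second default until they switch. A minimal illustration of the lookup:

```python
import os

# Unset -> default of 600 seconds; export LLM_TIMEOUT_SECONDS to override.
os.environ["LLM_TIMEOUT_SECONDS"] = "120"  # example override
timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
print(timeout)  # 120
```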
```diff
@@ -127,7 +127,7 @@ class Base(ABC):
             "tool_choice",
             "logprobs",
             "top_logprobs",
-            "extra_headers"
+            "extra_headers",
         }

         gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
```
```diff
@@ -1213,7 +1213,7 @@ class GoogleChat(Base):

         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
```diff
@@ -1242,14 +1242,14 @@ class GoogleChat(Base):
             role = "model" if item["role"] == "assistant" else item["role"]
             content = Content(
                 role=role,
-                parts=[Part(text=item["content"])]
+                parts=[Part(text=item["content"])],
             )
             contents.append(content)

         response = self.client.models.generate_content(
             model=self.model_name,
             contents=contents,
-            config=config
+            config=config,
         )

         ans = response.text
```
```diff
@@ -1299,7 +1299,7 @@ class GoogleChat(Base):

         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
```diff
@@ -1326,7 +1326,7 @@ class GoogleChat(Base):
             role = "model" if item["role"] == "assistant" else item["role"]
             content = Content(
                 role=role,
-                parts=[Part(text=item["content"])]
+                parts=[Part(text=item["content"])],
             )
             contents.append(content)

```
```diff
@@ -1334,7 +1334,7 @@ class GoogleChat(Base):
         for chunk in self.client.models.generate_content_stream(
             model=self.model_name,
             contents=contents,
-            config=config
+            config=config,
         ):
             text = chunk.text
             ans = text
```
```diff
@@ -1406,7 +1406,7 @@ class LiteLLMBase(ABC):
     ]

     def __init__(self, key, model_name, base_url=None, **kwargs):
-        self.timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        self.timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.provider = kwargs.get("provider", "")
         self.prefix = LITELLM_PROVIDER_PREFIX.get(self.provider, "")
         self.model_name = f"{self.prefix}{model_name}"
```
```diff
@@ -1625,6 +1625,7 @@ class LiteLLMBase(ABC):

         if self.provider == SupportedLiteLLMProvider.OpenRouter:
             if self.provider_order:
+
                 def _to_order_list(x):
                     if x is None:
                         return []
```
```diff
@@ -1633,6 +1634,7 @@ class LiteLLMBase(ABC):
                     if isinstance(x, (list, tuple)):
                         return [str(s).strip() for s in x if str(s).strip()]
                     return []
+
                 extra_body = {}
                 provider_cfg = {}
                 provider_order = _to_order_list(self.provider_order)
```
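The two hunks above only show the ends of `_to_order_list`; the lines between them (most likely a comma-separated-string case) are not part of this diff. A hedged reconstruction of the whole helper, with the string branch explicitly an assumption:

```python
def _to_order_list(x):
    # None -> empty list (shown in the diff)
    if x is None:
        return []
    # Assumed branch: the diff elides the lines between the two hunks, so this
    # comma-splitting string case is a guess at what sits there.
    if isinstance(x, str):
        return [s.strip() for s in x.split(",") if s.strip()]
    # list/tuple -> stripped strings (shown in the diff)
    if isinstance(x, (list, tuple)):
        return [str(s).strip() for s in x if str(s).strip()]
    return []
```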