Feat: Add TCADP parser for PPTX and spreadsheet document types. (#11041)

### What problem does this PR solve?

- Added TCADP Parser configuration fields to PDF, PPT, and spreadsheet parsing forms
- Implemented support for setting table result type (Markdown/HTML) and Markdown image response type (URL/Text)
- Updated TCADP Parser to handle return format settings from configuration or parameters
- Enhanced frontend to dynamically show TCADP options based on selected parsing method
- Modified backend to pass format parameters when calling TCADP API
- Optimized form default value logic for TCADP configuration items
- Updated multilingual resource files for new configuration options

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-12-06 15:29:03 +08:00 · 2025-11-20 10:08:42 +08:00
parent ecf0322165
commit 420c97199a
18 changed files with 668 additions and 37 deletions
--- a/conf/service_conf.yaml
+++ b/conf/service_conf.yaml
@@ -147,5 +147,3 @@ user_default_llm:
 #  secret_id: 'tencent_secret_id'
 #  secret_key: 'tencent_secret_key'
 #  region: 'tencent_region'
-#  table_result_type: '1'
-#  markdown_image_response_type: '1'
--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@@ -192,12 +192,16 @@ class TencentCloudAPIClient:


 class TCADPParser(RAGFlowPdfParser):
-    def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"):
+    def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou", 
+                 table_result_type: str = None, markdown_image_response_type: str = None):
        super().__init__()
        
        # First initialize logger
        self.logger = logging.getLogger(self.__class__.__name__)
        
+        # Log received parameters
+        self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
+        
        # Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
        try:
            tcadp_parser = get_base_config("tcadp_config", {})
@@ -205,14 +209,30 @@ class TCADPParser(RAGFlowPdfParser):
                self.secret_id = secret_id or tcadp_parser.get("secret_id")
                self.secret_key = secret_key or tcadp_parser.get("secret_key")
                self.region = region or tcadp_parser.get("region", "ap-guangzhou")
-                self.table_result_type = tcadp_parser.get("table_result_type", "1")
-                self.markdown_image_response_type = tcadp_parser.get("markdown_image_response_type", "1")
-                self.logger.info("[TCADP] Configuration read from service_conf.yaml")
+                # Set table_result_type and markdown_image_response_type from config or parameters
+                self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
+                self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
+                
            else:
                self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
+                # If config file is empty, use provided parameters or defaults
+                self.secret_id = secret_id
+                self.secret_key = secret_key
+                self.region = region or "ap-guangzhou"
+                self.table_result_type = table_result_type if table_result_type is not None else "1"
+                self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"

        except ImportError:
            self.logger.info("[TCADP] Configuration module import failed")
+            # If config file is not available, use provided parameters or defaults
+            self.secret_id = secret_id
+            self.secret_key = secret_key
+            self.region = region or "ap-guangzhou"
+            self.table_result_type = table_result_type if table_result_type is not None else "1"
+            self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
+
+        # Log final values
+        self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}")

        if not self.secret_id or not self.secret_key:
            raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
@@ -400,6 +420,8 @@ class TCADPParser(RAGFlowPdfParser):
                        "TableResultType": self.table_result_type,
                        "MarkdownImageResponseType": self.markdown_image_response_type
                    }
+                    
+                    self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")

                    result = client.reconstruct_document_sse(
                        file_type=file_type, 
--- a/docker/service_conf.yaml.template
+++ b/docker/service_conf.yaml.template
@@ -150,5 +150,3 @@ user_default_llm:
 #   secret_id: '${TENCENT_SECRET_ID}'
 #   secret_key: '${TENCENT_SECRET_KEY}'
 #   region: '${TENCENT_REGION}'
-#   table_result_type: '1'
-#   markdown_image_response_type: '1'
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -116,7 +116,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
    else:
        vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
        pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
-    
+
    sections, tables = pdf_parser(
        filename if not binary else binary,
        from_page=from_page,
@@ -504,7 +504,7 @@ class Markdown(MarkdownParser):

        return images if images else None

-    def __call__(self, filename, binary=None, separate_tables=True,delimiter=None):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
@@ -602,7 +602,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        _SerializedRelationships.load_from_xml = load_from_xml_v2
        sections, tables = Docx()(filename, binary)

-        tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
+        tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")
@@ -653,18 +653,47 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

        if name in ["tcadp", "docling", "mineru"]:
            parser_config["chunk_token_num"] = 0
-        
+
        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
-        excel_parser = ExcelParser()
-        if parser_config.get("html4excel"):
-            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+
+        # Check if tcadp_parser is selected for spreadsheet files
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if layout_recognizer == "TCADP Parser":
+            table_result_type = parser_config.get("table_result_type", "1")
+            markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
+            if not tcadp_parser.check_installation():
+                callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
+                return res
+
+            # Determine file type based on extension
+            file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
+
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=filename,
+                binary=binary,
+                callback=callback,
+                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
+                file_type=file_type
+            )
+            parser_config["chunk_token_num"] = 0
+            res = tokenize_table(tables, doc, is_english)
+            callback(0.8, "Finish parsing.")
        else:
-            sections = [(_, "") for _ in excel_parser(binary) if _]
-        parser_config["chunk_token_num"] = 12800
+            # Default DeepDOC parser
+            excel_parser = ExcelParser()
+            if parser_config.get("html4excel"):
+                sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+            else:
+                sections = [(_, "") for _ in excel_parser(binary) if _]
+            parser_config["chunk_token_num"] = 12800

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
@@ -676,7 +705,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。；！？"))
+        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。；！？"))

        try:
            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -16,6 +16,7 @@ import io
 import json
 import os
 import random
+import re
 from functools import partial

 import trio
@@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase):
                "output_format": "json",
            },
            "spreadsheet": {
+                "parse_method": "deepdoc",  # "TCADP Parser" (case-insensitive) selects TCADP; anything else uses DeepDOC
                "output_format": "html",
                "suffix": [
                    "xls",
@@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase):
                "output_format": "json",
            },
            "slides": {
+                "parse_method": "deepdoc",  # "TCADP Parser" (case-insensitive) selects TCADP; anything else uses DeepDOC
                "suffix": [
                    "pptx",
+                    "ppt"
                ],
                "output_format": "json",
            },
@@ -245,7 +249,12 @@ class Parser(ProcessBase):
                bboxes.append(box)
        elif conf.get("parse_method").lower() == "tcadp parser":
            # ADP is a document parsing tool using Tencent Cloud API
-            tcadp_parser = TCADPParser()
+            table_result_type = conf.get("table_result_type", "1")
+            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
            sections, _ = tcadp_parser.parse_pdf(
                filepath=name,
                binary=blob,
@@ -301,14 +310,86 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
        conf = self._param.setups["spreadsheet"]
        self.set_output("output_format", conf["output_format"])
-        spreadsheet_parser = ExcelParser()
-        if conf.get("output_format") == "html":
-            htmls = spreadsheet_parser.html(blob, 1000000000)
-            self.set_output("html", htmls[0])
-        elif conf.get("output_format") == "json":
-            self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
-        elif conf.get("output_format") == "markdown":
-            self.set_output("markdown", spreadsheet_parser.markdown(blob))
+
+        parse_method = conf.get("parse_method", "deepdoc")
+
+        # Handle TCADP parser
+        if parse_method.lower() == "tcadp parser":
+            table_result_type = conf.get("table_result_type", "1")
+            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
+            if not tcadp_parser.check_installation():
+                raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
+
+            # Determine file type based on extension
+            if re.search(r"\.xlsx?$", name, re.IGNORECASE):
+                file_type = "XLSX"
+            else:
+                file_type = "CSV"
+
+            self.callback(0.2, f"Using TCADP parser for {file_type} file.")
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=name,
+                binary=blob,
+                callback=self.callback,
+                file_type=file_type,
+                file_start_page=1,
+                file_end_page=1000
+            )
+
+            # Process TCADP parser output based on configured output_format
+            output_format = conf.get("output_format", "html")
+
+            if output_format == "html":
+                # For HTML output, combine sections and tables into HTML
+                html_content = ""
+                for section, position_tag in sections:
+                    if section:
+                        html_content += section + "\n"
+                for table in tables:
+                    if table:
+                        html_content += table + "\n"
+
+                self.set_output("html", html_content)
+
+            elif output_format == "json":
+                # For JSON output, create a list of text items
+                result = []
+                # Add sections as text
+                for section, position_tag in sections:
+                    if section:
+                        result.append({"text": section})
+                # Add tables as text
+                for table in tables:
+                    if table:
+                        result.append({"text": table})
+
+                self.set_output("json", result)
+
+            elif output_format == "markdown":
+                # For markdown output, combine into markdown
+                md_content = ""
+                for section, position_tag in sections:
+                    if section:
+                        md_content += section + "\n\n"
+                for table in tables:
+                    if table:
+                        md_content += table + "\n\n"
+
+                self.set_output("markdown", md_content)
+        else:
+            # Default DeepDOC parser
+            spreadsheet_parser = ExcelParser()
+            if conf.get("output_format") == "html":
+                htmls = spreadsheet_parser.html(blob, 1000000000)
+                self.set_output("html", htmls[0])
+            elif conf.get("output_format") == "json":
+                self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
+            elif conf.get("output_format") == "markdown":
+                self.set_output("markdown", spreadsheet_parser.markdown(blob))

    def _word(self, name, blob):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
@@ -326,22 +407,69 @@ class Parser(ProcessBase):
            self.set_output("markdown", markdown_text)

    def _slides(self, name, blob):
-        from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
-
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")

        conf = self._param.setups["slides"]
        self.set_output("output_format", conf["output_format"])

-        ppt_parser = ppt_parser()
-        txts = ppt_parser(blob, 0, 100000, None)
+        parse_method = conf.get("parse_method", "deepdoc")

-        sections = [{"text": section} for section in txts if section.strip()]
+        # Handle TCADP parser (parse_method is compared case-insensitively)
+        if parse_method.lower() == "tcadp parser":
+            table_result_type = conf.get("table_result_type", "1")
+            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
+            if not tcadp_parser.check_installation():
+                raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")

-        # json
-        assert conf.get("output_format") == "json", "have to be json for ppt"
-        if conf.get("output_format") == "json":
-            self.set_output("json", sections)
+            # Determine file type: only ".pptx" maps to PPTX; legacy ".ppt" falls through to PPT
+            if re.search(r"\.pptx$", name, re.IGNORECASE):
+                file_type = "PPTX"
+            else:
+                file_type = "PPT"
+
+            self.callback(0.2, f"Using TCADP parser for {file_type} file.")
+
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=name,
+                binary=blob,
+                callback=self.callback,
+                file_type=file_type,
+                file_start_page=1,
+                file_end_page=1000
+            )
+
+            # Process TCADP parser output - PPT only supports json format
+            output_format = conf.get("output_format", "json")
+            if output_format == "json":
+                # For JSON output, create a list of text items
+                result = []
+                # Add sections as text
+                for section, position_tag in sections:
+                    if section:
+                        result.append({"text": section})
+                # Add tables as text
+                for table in tables:
+                    if table:
+                        result.append({"text": table})
+
+                self.set_output("json", result)
+        else:
+            # Default DeepDOC parser (supports .pptx format)
+            from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
+
+            ppt_parser = ppt_parser()
+            txts = ppt_parser(blob, 0, 100000, None)
+
+            sections = [{"text": section} for section in txts if section.strip()]
+
+            # json
+            assert conf.get("output_format") == "json", "have to be json for ppt"
+            if conf.get("output_format") == "json":
+                self.set_output("json", sections)

    def _markdown(self, name, blob):
        from functools import reduce
@@ -579,6 +707,7 @@ class Parser(ProcessBase):
            "video": self._video,
            "email": self._email,
        }
+
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
        except Exception as e:
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -1752,6 +1752,8 @@ The variable aggregation node (originally the variable assignment node) is a cru
 The Indexer will store the content in the corresponding data structures for the selected methods.`,
      // file: 'File',
      parserMethod: 'PDF parser',
+      tableResultType: 'Table Result Type',
+      markdownImageResponseType: 'Markdown Image Response Type',
      // systemPrompt: 'System Prompt',
      systemPromptPlaceholder:
        'Enter system prompt for image analysis, if empty the system default value will be used',
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -1629,6 +1629,8 @@ General：实体和关系提取提示来自 GitHub - microsoft/graphrag：基于
 Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
      filenameEmbdWeight: '文件名嵌入权重',
      parserMethod: '解析方法',
+      tableResultType: '表格返回形式',
+      markdownImageResponseType: '图片返回形式',
      systemPromptPlaceholder:
        '请输入用于图像分析的系统提示词，若为空则使用系统缺省值',
      exportJson: '导出 JSON',
--- a/web/src/pages/agent/constant/pipeline.tsx
+++ b/web/src/pages/agent/constant/pipeline.tsx
@@ -169,6 +169,7 @@ export const initialParserValues = {
    {
      fileFormat: FileType.Spreadsheet,
      output_format: SpreadsheetOutputFormat.Html,
+      parse_method: ParseDocumentType.DeepDOC,
    },
    {
      fileFormat: FileType.Image,
@@ -192,6 +193,7 @@ export const initialParserValues = {
    {
      fileFormat: FileType.PowerPoint,
      output_format: PptOutputFormat.Json,
+      parse_method: ParseDocumentType.DeepDOC,
    },
  ],
 };
@@ -243,7 +245,7 @@ export const FileTypeSuffixMap = {
  [FileType.Email]: ['eml', 'msg'],
  [FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
  [FileType.Docx]: ['doc', 'docx'],
-  [FileType.PowerPoint]: ['pptx'],
+  [FileType.PowerPoint]: ['pptx', 'ppt'],
  [FileType.Video]: ['mp4', 'avi', 'mkv'],
  [FileType.Audio]: [
    'da',
--- a/web/src/pages/agent/form/parser-form/index.tsx
+++ b/web/src/pages/agent/form/parser-form/index.tsx
@@ -34,6 +34,8 @@ import { OutputFormatFormField } from './common-form-fields';
 import { EmailFormFields } from './email-form-fields';
 import { ImageFormFields } from './image-form-fields';
 import { PdfFormFields } from './pdf-form-fields';
+import { PptFormFields } from './ppt-form-fields';
+import { SpreadsheetFormFields } from './spreadsheet-form-fields';
 import { buildFieldNameWithPrefix } from './utils';
 import { AudioFormFields, VideoFormFields } from './video-form-fields';

@@ -41,6 +43,8 @@ const outputList = buildOutputList(initialParserValues.outputs);

 const FileFormatWidgetMap = {
  [FileType.PDF]: PdfFormFields,
+  [FileType.Spreadsheet]: SpreadsheetFormFields,
+  [FileType.PowerPoint]: PptFormFields,
  [FileType.Video]: VideoFormFields,
  [FileType.Audio]: AudioFormFields,
  [FileType.Email]: EmailFormFields,
@@ -65,6 +69,8 @@ export const FormSchema = z.object({
      fields: z.array(z.string()).optional(),
      llm_id: z.string().optional(),
      system_prompt: z.string().optional(),
+      table_result_type: z.string().optional(),
+      markdown_image_response_type: z.string().optional(),
    }),
  ),
 });
@@ -184,6 +190,8 @@ const ParserForm = ({ node }: INextOperatorForm) => {
      lang: '',
      fields: [],
      llm_id: '',
+      table_result_type: '',
+      markdown_image_response_type: '',
    });
  }, [append]);

--- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
@@ -1,13 +1,30 @@
 import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import {
+  SelectWithSearch,
+  SelectWithSearchFlagOptionType,
+} from '@/components/originui/select-with-search';
+import { RAGFlowFormItem } from '@/components/ragflow-form';
 import { isEmpty } from 'lodash';
 import { useEffect, useMemo } from 'react';
 import { useFormContext, useWatch } from 'react-hook-form';
+import { useTranslation } from 'react-i18next';
 import { LanguageFormField, ParserMethodFormField } from './common-form-fields';
 import { CommonProps } from './interface';
 import { useSetInitialLanguage } from './use-set-initial-language';
 import { buildFieldNameWithPrefix } from './utils';

+const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
+  { label: 'Markdown', value: '0' },
+  { label: 'HTML', value: '1' },
+];
+
+const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
+  { label: 'URL', value: '0' },
+  { label: 'Text', value: '1' },
+];
+
 export function PdfFormFields({ prefix }: CommonProps) {
+  const { t } = useTranslation();
  const form = useFormContext();

  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
@@ -25,6 +42,12 @@ export function PdfFormFields({ prefix }: CommonProps) {
    );
  }, [parseMethod]);

+  const tcadpOptionsShown = useMemo(() => {
+    return (
+      !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
+    );
+  }, [parseMethod]);
+
  useSetInitialLanguage({ prefix, languageShown });

  useEffect(() => {
@@ -36,10 +59,68 @@ export function PdfFormFields({ prefix }: CommonProps) {
    }
  }, [form, parseMethodName]);

+  // Set default values for TCADP options when TCADP is selected
+  useEffect(() => {
+    if (tcadpOptionsShown) {
+      const tableResultTypeName = buildFieldNameWithPrefix(
+        'table_result_type',
+        prefix,
+      );
+      const markdownImageResponseTypeName = buildFieldNameWithPrefix(
+        'markdown_image_response_type',
+        prefix,
+      );
+
+      if (isEmpty(form.getValues(tableResultTypeName))) {
+        form.setValue(tableResultTypeName, '1', {
+          shouldValidate: true,
+          shouldDirty: true,
+        });
+      }
+      if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
+        form.setValue(markdownImageResponseTypeName, '1', {
+          shouldValidate: true,
+          shouldDirty: true,
+        });
+      }
+    }
+  }, [tcadpOptionsShown, form, prefix]);
+
  return (
    <>
      <ParserMethodFormField prefix={prefix}></ParserMethodFormField>
      {languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
+      {tcadpOptionsShown && (
+        <>
+          <RAGFlowFormItem
+            name={buildFieldNameWithPrefix('table_result_type', prefix)}
+            label={t('flow.tableResultType') || '表格返回形式'}
+          >
+            {(field) => (
+              <SelectWithSearch
+                value={field.value}
+                onChange={field.onChange}
+                options={tableResultTypeOptions}
+              ></SelectWithSearch>
+            )}
+          </RAGFlowFormItem>
+          <RAGFlowFormItem
+            name={buildFieldNameWithPrefix(
+              'markdown_image_response_type',
+              prefix,
+            )}
+            label={t('flow.markdownImageResponseType') || '图片返回形式'}
+          >
+            {(field) => (
+              <SelectWithSearch
+                value={field.value}
+                onChange={field.onChange}
+                options={markdownImageResponseTypeOptions}
+              ></SelectWithSearch>
+            )}
+          </RAGFlowFormItem>
+        </>
+      )}
    </>
  );
 }
--- a/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx
@@ -0,0 +1,125 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import {
+  SelectWithSearch,
+  SelectWithSearchFlagOptionType,
+} from '@/components/originui/select-with-search';
+import { RAGFlowFormItem } from '@/components/ragflow-form';
+import { isEmpty } from 'lodash';
+import { useEffect, useMemo } from 'react';
+import { useFormContext, useWatch } from 'react-hook-form';
+import { useTranslation } from 'react-i18next';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
+  { label: 'Markdown', value: '0' },
+  { label: 'HTML', value: '1' },
+];
+
+const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
+  { label: 'URL', value: '0' },
+  { label: 'Text', value: '1' },
+];
+
+export function PptFormFields({ prefix }: CommonProps) {
+  const { t } = useTranslation();
+  const form = useFormContext();
+
+  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+
+  const parseMethod = useWatch({
+    name: parseMethodName,
+  });
+
+  // PPT only supports DeepDOC and TCADPParser
+  const optionsWithoutLLM = [
+    { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
+    {
+      label: ParseDocumentType.TCADPParser,
+      value: ParseDocumentType.TCADPParser,
+    },
+  ];
+
+  const tcadpOptionsShown = useMemo(() => {
+    return (
+      !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
+    );
+  }, [parseMethod]);
+
+  useEffect(() => {
+    if (isEmpty(form.getValues(parseMethodName))) {
+      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+        shouldValidate: true,
+        shouldDirty: true,
+      });
+    }
+  }, [form, parseMethodName]);
+
+  // Set default values for TCADP options when TCADP is selected
+  useEffect(() => {
+    if (tcadpOptionsShown) {
+      const tableResultTypeName = buildFieldNameWithPrefix(
+        'table_result_type',
+        prefix,
+      );
+      const markdownImageResponseTypeName = buildFieldNameWithPrefix(
+        'markdown_image_response_type',
+        prefix,
+      );
+
+      if (isEmpty(form.getValues(tableResultTypeName))) {
+        form.setValue(tableResultTypeName, '1', {
+          shouldValidate: true,
+          shouldDirty: true,
+        });
+      }
+      if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
+        form.setValue(markdownImageResponseTypeName, '1', {
+          shouldValidate: true,
+          shouldDirty: true,
+        });
+      }
+    }
+  }, [tcadpOptionsShown, form, prefix]);
+
+  return (
+    <>
+      <ParserMethodFormField
+        prefix={prefix}
+        optionsWithoutLLM={optionsWithoutLLM}
+      ></ParserMethodFormField>
+      {tcadpOptionsShown && (
+        <>
+          <RAGFlowFormItem
+            name={buildFieldNameWithPrefix('table_result_type', prefix)}
+            label={t('flow.tableResultType') || '表格返回形式'}
+          >
+            {(field) => (
+              <SelectWithSearch
+                value={field.value}
+                onChange={field.onChange}
+                options={tableResultTypeOptions}
+              ></SelectWithSearch>
+            )}
+          </RAGFlowFormItem>
+          <RAGFlowFormItem
+            name={buildFieldNameWithPrefix(
+              'markdown_image_response_type',
+              prefix,
+            )}
+            label={t('flow.markdownImageResponseType') || '图片返回形式'}
+          >
+            {(field) => (
+              <SelectWithSearch
+                value={field.value}
+                onChange={field.onChange}
+                options={markdownImageResponseTypeOptions}
+              ></SelectWithSearch>
+            )}
+          </RAGFlowFormItem>
+        </>
+      )}
+    </>
+  );
+}
--- a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
@@ -0,0 +1,125 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import {
+  SelectWithSearch,
+  SelectWithSearchFlagOptionType,
+} from '@/components/originui/select-with-search';
+import { RAGFlowFormItem } from '@/components/ragflow-form';
+import { isEmpty } from 'lodash';
+import { useEffect, useMemo } from 'react';
+import { useFormContext, useWatch } from 'react-hook-form';
+import { useTranslation } from 'react-i18next';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
+  { label: 'Markdown', value: '0' },
+  { label: 'HTML', value: '1' }, // '1' is what the form seeds when unset
+];
+
+const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
+  { label: 'URL', value: '0' },
+  { label: 'Text', value: '1' }, // '1' is what the form seeds when unset
+];
+
+/**
+ * Parser settings shown for spreadsheet files in the agent parser form.
+ *
+ * Renders the parse-method selector (DeepDOC or TCADP Parser) and, only while
+ * TCADP Parser is selected, selects for `table_result_type` and
+ * `markdown_image_response_type`. Empty fields are seeded with defaults:
+ * DeepDOC for the parse method and '1' for both TCADP fields.
+ *
+ * @param prefix - Forwarded to `buildFieldNameWithPrefix` for each field name.
+ */
+export function SpreadsheetFormFields({ prefix }: CommonProps) {
+  const { t } = useTranslation();
+  const form = useFormContext();
+
+  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+  const parseMethod = useWatch({ name: parseMethodName });
+
+  // Spreadsheet only supports DeepDOC and TCADPParser
+  const optionsWithoutLLM = [
+    ParseDocumentType.DeepDOC,
+    ParseDocumentType.TCADPParser,
+  ].map((type) => ({ label: type, value: type }));
+
+  // True only while the watched parse method is exactly TCADP Parser.
+  const tcadpOptionsShown = useMemo(() => {
+    if (isEmpty(parseMethod)) {
+      return false;
+    }
+    return parseMethod === ParseDocumentType.TCADPParser;
+  }, [parseMethod]);
+
+  // Fall back to DeepDOC when no parse method has been stored yet.
+  useEffect(() => {
+    if (!isEmpty(form.getValues(parseMethodName))) {
+      return;
+    }
+    form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+      shouldValidate: true,
+      shouldDirty: true,
+    });
+  }, [form, parseMethodName]);
+
+  // Seed still-empty TCADP fields with '1' once TCADP Parser is selected.
+  useEffect(() => {
+    if (!tcadpOptionsShown) {
+      return;
+    }
+    const tcadpFieldNames = [
+      buildFieldNameWithPrefix('table_result_type', prefix),
+      buildFieldNameWithPrefix('markdown_image_response_type', prefix),
+    ];
+    for (const fieldName of tcadpFieldNames) {
+      if (isEmpty(form.getValues(fieldName))) {
+        form.setValue(fieldName, '1', {
+          shouldValidate: true,
+          shouldDirty: true,
+        });
+      }
+    }
+  }, [tcadpOptionsShown, form, prefix]);
+
+  return (
+    <>
+      <ParserMethodFormField
+        prefix={prefix}
+        optionsWithoutLLM={optionsWithoutLLM}
+      />
+      {tcadpOptionsShown ? (
+        <>
+          <RAGFlowFormItem
+            name={buildFieldNameWithPrefix('table_result_type', prefix)}
+            label={t('flow.tableResultType') || '表格返回形式'}
+          >
+            {({ value, onChange }) => (
+              <SelectWithSearch
+                value={value}
+                onChange={onChange}
+                options={tableResultTypeOptions}
+              />
+            )}
+          </RAGFlowFormItem>
+          <RAGFlowFormItem
+            name={buildFieldNameWithPrefix(
+              'markdown_image_response_type',
+              prefix,
+            )}
+            label={t('flow.markdownImageResponseType') || '图片返回形式'}
+          >
+            {({ value, onChange }) => (
+              <SelectWithSearch
+                value={value}
+                onChange={onChange}
+                options={markdownImageResponseTypeOptions}
+              />
+            )}
+          </RAGFlowFormItem>
+        </>
+      ) : null}
+    </>
+  );
+}
--- a/web/src/pages/agent/utils.ts
+++ b/web/src/pages/agent/utils.ts
@@ -214,6 +214,36 @@ function transformParserParams(params: ParserFormSchemaType) {
            parse_method: cur.parse_method,
            lang: cur.lang,
          };
+          // Only include TCADP parameters if TCADP Parser is selected
+          if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
+            filteredSetup.table_result_type = cur.table_result_type;
+            filteredSetup.markdown_image_response_type =
+              cur.markdown_image_response_type;
+          }
+          break;
+        case FileType.Spreadsheet:
+          filteredSetup = {
+            ...filteredSetup,
+            parse_method: cur.parse_method,
+          };
+          // Only include TCADP parameters if TCADP Parser is selected
+          if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
+            filteredSetup.table_result_type = cur.table_result_type;
+            filteredSetup.markdown_image_response_type =
+              cur.markdown_image_response_type;
+          }
+          break;
+        case FileType.PowerPoint:
+          filteredSetup = {
+            ...filteredSetup,
+            parse_method: cur.parse_method,
+          };
+          // Only include TCADP parameters if TCADP Parser is selected
+          if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
+            filteredSetup.table_result_type = cur.table_result_type;
+            filteredSetup.markdown_image_response_type =
+              cur.markdown_image_response_type;
+          }
          break;
        case FileType.Image:
          filteredSetup = {
--- a/web/src/pages/data-flow/constant.tsx
+++ b/web/src/pages/data-flow/constant.tsx
--- a/web/src/pages/data-flow/form/parser-form/index.tsx
+++ b/web/src/pages/data-flow/form/parser-form/index.tsx
--- a/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx
+++ b/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx
@@ -0,0 +1,40 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import { isEmpty } from 'lodash';
+import { useEffect } from 'react';
+import { useFormContext } from 'react-hook-form';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+/**
+ * Parse-method selector for PowerPoint files in the data-flow parser form.
+ * PPT only supports DeepDOC and TCADP Parser; an empty field gets DeepDOC.
+ */
+export function PptFormFields({ prefix }: CommonProps) {
+  const form = useFormContext();
+  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+
+  const optionsWithoutLLM = [
+    ParseDocumentType.DeepDOC,
+    ParseDocumentType.TCADPParser,
+  ].map((type) => ({ label: type, value: type }));
+
+  useEffect(() => {
+    if (!isEmpty(form.getValues(parseMethodName))) {
+      return;
+    }
+    form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+      shouldValidate: true,
+      shouldDirty: true,
+    });
+  }, [form, parseMethodName]);
+
+  return (
+    <>
+      <ParserMethodFormField
+        prefix={prefix}
+        optionsWithoutLLM={optionsWithoutLLM}
+      />
+    </>
+  );
+}
--- a/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx
+++ b/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx
@@ -0,0 +1,40 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import { isEmpty } from 'lodash';
+import { useEffect } from 'react';
+import { useFormContext } from 'react-hook-form';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+/**
+ * Parse-method selector for spreadsheet files in the data-flow parser form.
+ * Spreadsheet only supports DeepDOC and TCADP Parser; empty means DeepDOC.
+ */
+export function SpreadsheetFormFields({ prefix }: CommonProps) {
+  const form = useFormContext();
+  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+
+  const optionsWithoutLLM = [
+    ParseDocumentType.DeepDOC,
+    ParseDocumentType.TCADPParser,
+  ].map((type) => ({ label: type, value: type }));
+
+  useEffect(() => {
+    if (!isEmpty(form.getValues(parseMethodName))) {
+      return;
+    }
+    form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+      shouldValidate: true,
+      shouldDirty: true,
+    });
+  }, [form, parseMethodName]);
+
+  return (
+    <>
+      <ParserMethodFormField
+        prefix={prefix}
+        optionsWithoutLLM={optionsWithoutLLM}
+      />
+    </>
+  );
+}
--- a/web/src/pages/data-flow/utils.ts
+++ b/web/src/pages/data-flow/utils.ts