Compare commits

...

5 Commits

Author  SHA1        Message                                                                         Date
jyong   a6b2ac760f  add file support type choose                                                    2023-12-13 16:40:27 +08:00
jyong   c1a0cb429a  Merge remote-tracking branch 'origin/main' into feat/add-unstructured-support   2023-12-13 14:33:46 +08:00
jyong   3ac204f599  unstructured api support                                                        2023-11-22 15:09:24 +08:00
jyong   7d17d36243  unstructured api support                                                        2023-11-21 17:10:05 +08:00
jyong   41cbd9bacc  add unstructured support                                                        2023-11-21 15:00:19 +08:00
5 changed files with 93 additions and 9 deletions

View File

@@ -69,5 +69,20 @@ class FilePreviewApi(Resource):
        return {'content': text}


class FileSupportTypeApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        etl_type = current_app.config['ETL_TYPE']
        if etl_type == 'sad':
            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
        else:
            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
        return {'allowed_extensions': allowed_extensions}


api.add_resource(FileApi, '/files/upload')
api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
api.add_resource(FileSupportTypeApi, '/files/support-type')
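
For reference, a hedged sketch of a client call against the new endpoint. The /console/api prefix, port, and bearer token below are assumptions about the deployment, not part of this diff.

import requests

# Hypothetical request; adjust base URL and auth to the actual deployment.
resp = requests.get(
    "http://localhost:5001/console/api/files/support-type",
    headers={"Authorization": "Bearer <console-access-token>"},
)
resp.raise_for_status()
print(resp.json()["allowed_extensions"])
# e.g. ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']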

View File

@@ -3,7 +3,7 @@ from pathlib import Path
from typing import List, Union, Optional
import requests
from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredAPIFileLoader
from langchain.schema import Document
from core.data_loader.loader.csv_loader import CSVLoader
@@ -50,13 +50,27 @@ class FileExtractor:
        delimiter = '\n'
        file_extension = input_file.suffix.lower()
        if is_automatic:
            loader = UnstructuredFileLoader(
                file_path, strategy="hi_res", mode="elements"
            if file_extension == '.xlsx':
                loader = ExcelLoader(file_path)
            elif file_extension == '.pdf':
                loader = PdfLoader(file_path, upload_file=upload_file)
            elif file_extension in ['.md', '.markdown']:
                loader = MarkdownLoader(file_path, autodetect_encoding=True)
            elif file_extension in ['.htm', '.html']:
                loader = HTMLLoader(file_path)
            elif file_extension == '.docx':
                loader = Docx2txtLoader(file_path)
            elif file_extension == '.csv':
                loader = CSVLoader(file_path, autodetect_encoding=True)
            else:
                # txt
                loader = TextLoader(file_path, autodetect_encoding=True)
            loader = UnstructuredAPIFileLoader(
                file_path=file_path,
                url="http://127.0.0.1:8000/general/v0/general",
                mode='single',
                strategy='auto'
            )
            # loader = UnstructuredAPIFileLoader(
            #     file_path=filenames[0],
            #     api_key="FAKE_API_KEY",
            # )
        else:
            if file_extension == '.xlsx':
                loader = ExcelLoader(file_path)
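
The automatic path now delegates extraction to the Unstructured API. Below is a minimal standalone sketch of the same loader call, assuming an Unstructured API server is listening on 127.0.0.1:8000 as in the diff above; the input file name is a placeholder.

from langchain.document_loaders import UnstructuredAPIFileLoader

# Same call pattern as in the extractor; 'example.pptx' is a placeholder input file.
loader = UnstructuredAPIFileLoader(
    file_path="example.pptx",
    url="http://127.0.0.1:8000/general/v0/general",
    mode="single",      # one Document for the whole file rather than per-element documents
    strategy="auto",
)
docs = loader.load()
print(docs[0].page_content[:200])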

View File

@@ -0,0 +1,55 @@
import logging
from typing import List, Optional

from langchain.document_loaders import PyPDFium2Loader
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

from extensions.ext_storage import storage
from models.model import UploadFile

logger = logging.getLogger(__name__)


class UnstructuredPdfLoader(BaseLoader):
    """Load pdf files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(
            self,
            file_path: str,
            upload_file: Optional[UploadFile] = None
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._upload_file = upload_file

    def load(self) -> List[Document]:
        plaintext_file_key = ''
        plaintext_file_exists = False
        if self._upload_file:
            if self._upload_file.hash:
                plaintext_file_key = 'upload_files/' + self._upload_file.tenant_id + '/' \
                                     + self._upload_file.hash + '.0625.plaintext'
                try:
                    text = storage.load(plaintext_file_key).decode('utf-8')
                    plaintext_file_exists = True
                    return [Document(page_content=text)]
                except FileNotFoundError:
                    pass

        documents = PyPDFium2Loader(file_path=self._file_path).load()
        text_list = []
        for document in documents:
            text_list.append(document.page_content)
        text = "\n\n".join(text_list)

        # save plaintext file for caching
        if not plaintext_file_exists and plaintext_file_key:
            storage.save(plaintext_file_key, text.encode('utf-8'))

        return documents
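
A usage sketch for the new loader's caching behaviour, assuming upload_file is an existing UploadFile row with tenant_id and hash set; the module path and PDF path are illustrative only.

# Illustrative only: the import path and file path are assumptions, not from the diff.
from core.data_loader.loader.unstructured.unstructured_pdf import UnstructuredPdfLoader

docs = UnstructuredPdfLoader("/tmp/example.pdf", upload_file=upload_file).load()
# First call parses with PyPDFium2 and saves 'upload_files/<tenant_id>/<hash>.0625.plaintext'.

cached = UnstructuredPdfLoader("/tmp/example.pdf", upload_file=upload_file).load()
# Subsequent calls return the cached plaintext as a single Document.
print(cached[0].page_content[:200])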

View File

@@ -397,7 +397,7 @@ class IndexingRunner:
                one_or_none()

            if file_detail:
                text_docs = FileExtractor.load(file_detail, is_automatic=False)
                text_docs = FileExtractor.load(file_detail, is_automatic=True)
        elif dataset_document.data_source_type == 'notion_import':
            loader = NotionLoader.from_document(dataset_document)
            text_docs = loader.load()

View File

@@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model):
        ],
        'segmentation': {
            'delimiter': '\n',
            'max_tokens': 512
            'max_tokens': 1000
        }
    }