Compare commits

...

5 Commits

Author  SHA1        Message                                                                         Date
jyong   a6b2ac760f  add file support type choose                                                    2023-12-13 16:40:27 +08:00
jyong   c1a0cb429a  Merge remote-tracking branch 'origin/main' into feat/add-unstructured-support   2023-12-13 14:33:46 +08:00
jyong   3ac204f599  unstructured api support                                                        2023-11-22 15:09:24 +08:00
jyong   7d17d36243  unstructured api support                                                        2023-11-21 17:10:05 +08:00
jyong   41cbd9bacc  add unstructured support                                                        2023-11-21 15:00:19 +08:00
5 changed files with 93 additions and 9 deletions

View File

@@ -69,5 +69,20 @@ class FilePreviewApi(Resource):
        return {'content': text}


class FileSupportTypeApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        etl_type = current_app.config['ETL_TYPE']
        if etl_type == 'sad':
            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
        else:
            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
        return {'allowed_extensions': allowed_extensions}


api.add_resource(FileApi, '/files/upload')
api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
api.add_resource(FileSupportTypeApi, '/files/support-type')
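
For reference, a hedged sketch of a client call against the new endpoint. The /console/api prefix, port, and bearer token below are assumptions about the deployment, not part of this diff.

import requests

# Hypothetical request; adjust base URL and auth to the actual deployment.
resp = requests.get(
    "http://localhost:5001/console/api/files/support-type",
    headers={"Authorization": "Bearer <console-access-token>"},
)
resp.raise_for_status()
print(resp.json()["allowed_extensions"])
# e.g. ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']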

View File

@@ -3,7 +3,7 @@ from pathlib import Path
from typing import List, Union, Optional
import requests
from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredAPIFileLoader
from langchain.schema import Document
from core.data_loader.loader.csv_loader import CSVLoader
@@ -50,13 +50,27 @@ class FileExtractor:
        delimiter = '\n'
        file_extension = input_file.suffix.lower()
        if is_automatic:
            loader = UnstructuredFileLoader(
                file_path, strategy="hi_res", mode="elements"
            if file_extension == '.xlsx':
                loader = ExcelLoader(file_path)
            elif file_extension == '.pdf':
                loader = PdfLoader(file_path, upload_file=upload_file)
            elif file_extension in ['.md', '.markdown']:
                loader = MarkdownLoader(file_path, autodetect_encoding=True)
            elif file_extension in ['.htm', '.html']:
                loader = HTMLLoader(file_path)
            elif file_extension == '.docx':
                loader = Docx2txtLoader(file_path)
            elif file_extension == '.csv':
                loader = CSVLoader(file_path, autodetect_encoding=True)
            else:
                # txt
                loader = TextLoader(file_path, autodetect_encoding=True)
            loader = UnstructuredAPIFileLoader(
                file_path=file_path,
                url="http://127.0.0.1:8000/general/v0/general",
                mode='single',
                strategy='auto'
            )
            # loader = UnstructuredAPIFileLoader(
            #     file_path=filenames[0],
            #     api_key="FAKE_API_KEY",
            # )
        else:
            if file_extension == '.xlsx':
                loader = ExcelLoader(file_path)
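
The automatic path now delegates extraction to the Unstructured API. Below is a minimal standalone sketch of the same loader call, assuming an Unstructured API server is listening on 127.0.0.1:8000 as in the diff above; the input file name is a placeholder.

from langchain.document_loaders import UnstructuredAPIFileLoader

# Same call pattern as in the extractor; 'example.pptx' is a placeholder input file.
loader = UnstructuredAPIFileLoader(
    file_path="example.pptx",
    url="http://127.0.0.1:8000/general/v0/general",
    mode="single",      # one Document for the whole file rather than per-element documents
    strategy="auto",
)
docs = loader.load()
print(docs[0].page_content[:200])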

View File

@@ -0,0 +1,55 @@
import logging
from typing import List, Optional

from langchain.document_loaders import PyPDFium2Loader
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

from extensions.ext_storage import storage
from models.model import UploadFile

logger = logging.getLogger(__name__)


class UnstructuredPdfLoader(BaseLoader):
    """Load pdf files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(
            self,
            file_path: str,
            upload_file: Optional[UploadFile] = None
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._upload_file = upload_file

    def load(self) -> List[Document]:
        plaintext_file_key = ''
        plaintext_file_exists = False
        if self._upload_file:
            if self._upload_file.hash:
                plaintext_file_key = 'upload_files/' + self._upload_file.tenant_id + '/' \
                                     + self._upload_file.hash + '.0625.plaintext'
                try:
                    text = storage.load(plaintext_file_key).decode('utf-8')
                    plaintext_file_exists = True
                    return [Document(page_content=text)]
                except FileNotFoundError:
                    pass

        documents = PyPDFium2Loader(file_path=self._file_path).load()
        text_list = []
        for document in documents:
            text_list.append(document.page_content)
        text = "\n\n".join(text_list)

        # save plaintext file for caching
        if not plaintext_file_exists and plaintext_file_key:
            storage.save(plaintext_file_key, text.encode('utf-8'))

        return documents
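
A usage sketch for the new loader's caching behaviour, assuming upload_file is an existing UploadFile row with tenant_id and hash set; the module path and PDF path are illustrative only.

# Illustrative only: the import path and file path are assumptions, not from the diff.
from core.data_loader.loader.unstructured.unstructured_pdf import UnstructuredPdfLoader

docs = UnstructuredPdfLoader("/tmp/example.pdf", upload_file=upload_file).load()
# First call parses with PyPDFium2 and saves 'upload_files/<tenant_id>/<hash>.0625.plaintext'.

cached = UnstructuredPdfLoader("/tmp/example.pdf", upload_file=upload_file).load()
# Subsequent calls return the cached plaintext as a single Document.
print(cached[0].page_content[:200])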

View File

@@ -397,7 +397,7 @@ class IndexingRunner:
                one_or_none()

            if file_detail:
                text_docs = FileExtractor.load(file_detail, is_automatic=False)
                text_docs = FileExtractor.load(file_detail, is_automatic=True)
        elif dataset_document.data_source_type == 'notion_import':
            loader = NotionLoader.from_document(dataset_document)
            text_docs = loader.load()

View File

@@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model):
        ],
        'segmentation': {
            'delimiter': '\n',
            'max_tokens': 512
            'max_tokens': 1000
        }
    }