diff --git a/api/core/rag/extractor/helpers.py b/api/core/rag/extractor/helpers.py
index 00004409d6..5166c0c768 100644
--- a/api/core/rag/extractor/helpers.py
+++ b/api/core/rag/extractor/helpers.py
@@ -1,7 +1,9 @@
 """Document loader helpers."""
 
 import concurrent.futures
-from typing import NamedTuple, cast
+from typing import NamedTuple
+
+import charset_normalizer
 
 
 class FileEncoding(NamedTuple):
@@ -27,14 +29,14 @@ def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1
         sample_size: The number of bytes to read for encoding detection.
                      Default is 1MB. For large files, reading only a sample
                      is sufficient and prevents timeout.
     """
-    import chardet
-    def read_and_detect(file_path: str):
-        with open(file_path, "rb") as f:
-            # Read only a sample of the file for encoding detection
-            # This prevents timeout on large files while still providing accurate encoding detection
-            rawdata = f.read(sample_size)
-            return cast(list[dict], chardet.detect_all(rawdata))
+    def read_and_detect(filename: str):
+        rst = charset_normalizer.from_path(filename)
+        best = rst.best()
+        if best is None:
+            return []
+        file_encoding = FileEncoding(encoding=best.encoding, confidence=best.coherence, language=best.language)
+        return [file_encoding]
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         future = executor.submit(read_and_detect, file_path)
diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index ef6913d0bd..ed3ed3e0de 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass
 from typing import Any, cast
 from urllib.parse import unquote
 
-import chardet
+import charset_normalizer
 import cloudscraper
 from readabilipy import simple_json_from_html_string
 
@@ -69,9 +69,12 @@ def get_url(url: str, user_agent: str | None = None) -> str:
     if response.status_code != 200:
         return f"URL returned status code {response.status_code}."
 
-    # Detect encoding using chardet
-    detected_encoding = chardet.detect(response.content)
-    encoding = detected_encoding["encoding"]
+    # Detect encoding using charset_normalizer
+    detected_encoding = charset_normalizer.from_bytes(response.content).best()
+    if detected_encoding:
+        encoding = detected_encoding.encoding
+    else:
+        encoding = "utf-8"
     if encoding:
         try:
             content = response.content.decode(encoding)
diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index f05c5f9873..14ebd1f9ae 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -7,7 +7,7 @@ import tempfile
 from collections.abc import Mapping, Sequence
 from typing import Any
 
-import chardet
+import charset_normalizer
 import docx
 import pandas as pd
 import pypandoc
@@ -228,9 +228,12 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
 
 def _extract_text_from_plain_text(file_content: bytes) -> str:
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content, cp_isolation=["utf_8", "latin_1", "cp1252"]).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
@@ -247,9 +250,12 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:
 
 def _extract_text_from_json(file_content: bytes) -> str:
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
@@ -269,9 +275,12 @@ def _extract_text_from_json(file_content: bytes) -> str:
 def _extract_text_from_yaml(file_content: bytes) -> str:
     """Extract the content from yaml file"""
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
@@ -424,9 +433,12 @@ def _extract_text_from_file(file: File):
 
 def _extract_text_from_csv(file_content: bytes) -> str:
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 15f7798f99..f08e09eeb9 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
     "bs4~=0.0.1",
    "cachetools~=5.3.0",
     "celery~=5.5.2",
-    "chardet~=5.1.0",
+    "charset-normalizer>=3.4.4",
     "flask~=3.1.2",
     "flask-compress>=1.17,<1.18",
     "flask-cors~=6.0.0",
diff --git a/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py b/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py
index 0bf4a3cf91..1361e16b06 100644
--- a/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py
+++ b/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py
@@ -1,3 +1,5 @@
+from types import SimpleNamespace
+
 import pytest
 
 from core.tools.utils.web_reader_tool import (
@@ -103,7 +105,10 @@ def test_get_url_html_flow_with_chardet_and_readability(monkeypatch: pytest.Monk
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
 
     # readability → a dict that maps to Article, then FULL_TEMPLATE
     def fake_simple_json_from_html_string(html, use_readability=True):
@@ -134,7 +139,9 @@ def test_get_url_html_flow_empty_article_text_returns_empty(monkeypatch: pytest.
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
 
     # readability returns empty plain_text
     monkeypatch.setattr(mod, "simple_json_from_html_string", lambda html, use_readability=True: {"plain_text": []})
@@ -162,7 +169,9 @@ def test_get_url_403_cloudscraper_fallback(monkeypatch: pytest.MonkeyPatch, stub
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.cloudscraper, "create_scraper", lambda: FakeScraper())
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
     monkeypatch.setattr(
         mod,
         "simple_json_from_html_string",
@@ -234,7 +243,10 @@ def test_get_url_html_encoding_fallback_when_decode_fails(monkeypatch: pytest.Mo
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
     monkeypatch.setattr(
         mod,
         "simple_json_from_html_string",
diff --git a/api/uv.lock b/api/uv.lock
index e36e3e9b5f..68ff250bce 100644
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -1348,7 +1348,7 @@ dependencies = [
     { name = "bs4" },
     { name = "cachetools" },
     { name = "celery" },
-    { name = "chardet" },
+    { name = "charset-normalizer" },
     { name = "croniter" },
     { name = "flask" },
     { name = "flask-compress" },
@@ -1544,7 +1544,7 @@ requires-dist = [
     { name = "bs4", specifier = "~=0.0.1" },
     { name = "cachetools", specifier = "~=5.3.0" },
     { name = "celery", specifier = "~=5.5.2" },
-    { name = "chardet", specifier = "~=5.1.0" },
+    { name = "charset-normalizer", specifier = ">=3.4.4" },
     { name = "croniter", specifier = ">=6.0.0" },
     { name = "flask", specifier = "~=3.1.2" },
     { name = "flask-compress", specifier = ">=1.17,<1.18" },
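
For reference, every call site in this patch repeats the same "best match or utf-8 fallback" pattern. A minimal standalone sketch of it, assuming charset-normalizer 3.x; the detect_encoding helper name and the sample bytes are illustrative, not part of the patch:

    import charset_normalizer

    def detect_encoding(data: bytes) -> str:
        # from_bytes() ranks candidate encodings; best() returns None when
        # no plausible match is found, hence the explicit utf-8 fallback.
        best = charset_normalizer.from_bytes(data).best()
        return best.encoding if best is not None else "utf-8"

    raw = "Héllo wörld, çafé crème".encode("cp1252")
    print(raw.decode(detect_encoding(raw)))

The names best() exposes via .encoding (e.g. "utf_8", "cp1252", "latin_1") are valid Python codec aliases, so they can be passed straight to bytes.decode(), which is what the patched call sites do.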