Compare commits

...

10 Commits

Author  SHA1  Message  Date
Frederick2313072  594906c1ff  fix: MD5 and 8-hex Suffix Collision Risk  2025-09-24 17:01:23 +08:00
Frederick2313072  80f8245f2e  fix(api): sync api/uv.lock with main to resolve binary diff  2025-09-24 12:00:50 +08:00
Frederick2313072  a12b437c16  fix(api): sync api/uv.lock with main to resolve binary diff  2025-09-24 11:58:07 +08:00
Frederick2313072  12de554313  fix: add index initialization checks, improve batch vector operations and search, ensure robust exception handling  2025-09-23 16:41:46 +08:00
Frederick2313072  1f36c0c1c5  sync docker compose files with main branch  2025-09-23 00:12:54 +08:00
Frederick2313072  8b9297563c  fix  2025-09-23 00:03:31 +08:00
Frederick2313072  1cbe9eedb6  fix(pinecone): normalize index names and sanitize metadata to meet API constraints  2025-09-20 02:56:53 +08:00
Frederick2313072  90fc5a1f12  pipecone  2025-09-16 08:57:46 +08:00
Frederick2313072  41dfdf1ac0  fix: score threshold  2025-09-01 16:34:17 +08:00
Frederick2313072  dd7de74aa6  fix: hard-coded top-k fallback  2025-09-01 14:27:43 +08:00
52 changed files with 3226 additions and 2362 deletions

View File

@@ -156,7 +156,7 @@ WEB_API_CORS_ALLOW_ORIGINS=http://localhost:3000,*
CONSOLE_CORS_ALLOW_ORIGINS=http://localhost:3000,*
# Vector database configuration
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`,`vastbase`,`tidb`,`tidb_on_qdrant`,`baidu`,`lindorm`,`huawei_cloud`,`upstash`, `matrixone`.
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`,`vastbase`,`tidb`,`tidb_on_qdrant`,`baidu`,`lindorm`,`huawei_cloud`,`upstash`, `matrixone`, `pinecone`.
VECTOR_STORE=weaviate
# Prefix used to create collection name in vector database
VECTOR_INDEX_NAME_PREFIX=Vector_index
@@ -361,6 +361,17 @@ PROMPT_GENERATION_MAX_TOKENS=512
CODE_GENERATION_MAX_TOKENS=1024
PLUGIN_BASED_TOKEN_COUNTING_ENABLED=false
# Pinecone configuration, only available when VECTOR_STORE is `pinecone`
PINECONE_API_KEY=your-pinecone-api-key
PINECONE_ENVIRONMENT=your-pinecone-environment
PINECONE_INDEX_NAME=dify-index
PINECONE_CLIENT_TIMEOUT=30
PINECONE_BATCH_SIZE=100
PINECONE_METRIC=cosine
PINECONE_PODS=1
PINECONE_POD_TYPE=s1
# Mail configuration, support: resend, smtp, sendgrid
MAIL_TYPE=
# If using SendGrid, use the 'from' field for authentication if necessary.

View File

@@ -35,6 +35,7 @@ from .vdb.opensearch_config import OpenSearchConfig
from .vdb.oracle_config import OracleConfig
from .vdb.pgvector_config import PGVectorConfig
from .vdb.pgvectors_config import PGVectoRSConfig
from .vdb.pinecone_config import PineconeConfig
from .vdb.qdrant_config import QdrantConfig
from .vdb.relyt_config import RelytConfig
from .vdb.tablestore_config import TableStoreConfig
@@ -331,6 +332,7 @@ class MiddlewareConfig(
PGVectorConfig,
VastbaseVectorConfig,
PGVectoRSConfig,
PineconeConfig,
QdrantConfig,
RelytConfig,
TencentVectorDBConfig,

View File

@@ -0,0 +1,41 @@
from typing import Optional
from pydantic import Field, PositiveInt
from pydantic_settings import BaseSettings
class PineconeConfig(BaseSettings):
    """
    Configuration settings for Pinecone vector database
    """

    PINECONE_API_KEY: Optional[str] = Field(
        description="API key for authenticating with Pinecone service",
        default=None,
    )

    PINECONE_ENVIRONMENT: Optional[str] = Field(
        description="Pinecone environment (e.g., 'us-west1-gcp', 'us-east-1-aws')",
        default=None,
    )

    PINECONE_INDEX_NAME: Optional[str] = Field(
        description="Default Pinecone index name",
        default=None,
    )

    PINECONE_CLIENT_TIMEOUT: PositiveInt = Field(
        description="Timeout in seconds for Pinecone client operations (default is 30 seconds)",
        default=30,
    )

    PINECONE_BATCH_SIZE: PositiveInt = Field(
        description="Batch size for Pinecone operations (default is 100)",
        default=100,
    )

    PINECONE_METRIC: str = Field(
        description="Distance metric for Pinecone index (cosine, euclidean, dotproduct)",
        default="cosine",
    )
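
These are plain pydantic-settings fields, so they are populated directly from the matching environment variables. A minimal sketch (the values are placeholders, not real credentials):

import os

os.environ["PINECONE_API_KEY"] = "pc-xxxx"        # placeholder key
os.environ["PINECONE_ENVIRONMENT"] = "us-east-1"  # placeholder region

config = PineconeConfig()
print(config.PINECONE_API_KEY)         # "pc-xxxx"
print(config.PINECONE_CLIENT_TIMEOUT)  # 30 (default)
print(config.PINECONE_BATCH_SIZE)      # 100 (default)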

View File

@@ -660,6 +660,7 @@ class DatasetRetrievalSettingApi(Resource):
| VectorType.BAIDU
| VectorType.VIKINGDB
| VectorType.UPSTASH
| VectorType.PINECONE
):
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
case (
@@ -711,6 +712,7 @@ class DatasetRetrievalSettingMockApi(Resource):
| VectorType.BAIDU
| VectorType.VIKINGDB
| VectorType.UPSTASH
| VectorType.PINECONE
):
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
case (

View File

@@ -24,7 +24,7 @@ default_retrieval_model = {
"search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
"reranking_enable": False,
"reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
"top_k": 2,
"top_k": 4,
"score_threshold_enabled": False,
}

View File

@@ -256,7 +256,7 @@ class AnalyticdbVectorOpenAPI:
response = self._client.query_collection_data(request)
documents = []
for match in response.body.matches.match:
if match.score > score_threshold:
if match.score >= score_threshold:
metadata = json.loads(match.metadata.get("metadata_"))
metadata["score"] = match.score
doc = Document(
@@ -293,7 +293,7 @@ class AnalyticdbVectorOpenAPI:
response = self._client.query_collection_data(request)
documents = []
for match in response.body.matches.match:
if match.score > score_threshold:
if match.score >= score_threshold:
metadata = json.loads(match.metadata.get("metadata_"))
metadata["score"] = match.score
doc = Document(
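
This `>` to `>=` change is repeated across the vector-store adapters below; the practical effect is that matches scoring exactly at the configured threshold are now kept rather than dropped. A small illustration with made-up scores:

matches = [("doc-a", 0.80), ("doc-b", 0.79)]
score_threshold = 0.80

kept_before = [d for d, s in matches if s > score_threshold]   # []        -- doc-a was dropped
kept_after = [d for d, s in matches if s >= score_threshold]   # ["doc-a"] -- boundary match kept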

View File

@@ -229,7 +229,7 @@ class AnalyticdbVectorBySql:
documents = []
for record in cur:
id, vector, score, page_content, metadata = record
if score > score_threshold:
if score >= score_threshold:
metadata["score"] = score
doc = Document(
page_content=page_content,

View File

@@ -157,7 +157,7 @@ class BaiduVector(BaseVector):
if meta is not None:
meta = json.loads(meta)
score = row.get("score", 0.0)
if score > score_threshold:
if score >= score_threshold:
meta["score"] = score
doc = Document(page_content=row_data.get(self.field_text), metadata=meta)
docs.append(doc)

View File

@@ -120,7 +120,7 @@ class ChromaVector(BaseVector):
distance = distances[index]
metadata = dict(metadatas[index])
score = 1 - distance
if score > score_threshold:
if score >= score_threshold:
metadata["score"] = score
doc = Document(
page_content=documents[index],

View File

@@ -304,7 +304,7 @@ class CouchbaseVector(BaseVector):
return docs
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
top_k = kwargs.get("top_k", 2)
top_k = kwargs.get("top_k", 4)
try:
CBrequest = search.SearchRequest.create(search.QueryStringQuery("text:" + query))
search_iter = self._scope.search(

View File

@@ -216,7 +216,7 @@ class ElasticSearchVector(BaseVector):
docs = []
for doc, score in docs_and_scores:
score_threshold = float(kwargs.get("score_threshold") or 0.0)
if score > score_threshold:
if score >= score_threshold:
if doc.metadata is not None:
doc.metadata["score"] = score
docs.append(doc)

View File

@@ -127,7 +127,7 @@ class HuaweiCloudVector(BaseVector):
docs = []
for doc, score in docs_and_scores:
score_threshold = float(kwargs.get("score_threshold") or 0.0)
if score > score_threshold:
if score >= score_threshold:
if doc.metadata is not None:
doc.metadata["score"] = score
docs.append(doc)

View File

@@ -275,7 +275,7 @@ class LindormVectorStore(BaseVector):
docs = []
for doc, score in docs_and_scores:
score_threshold = kwargs.get("score_threshold", 0.0) or 0.0
if score > score_threshold:
if score >= score_threshold:
if doc.metadata is not None:
doc.metadata["score"] = score
docs.append(doc)

View File

@@ -194,7 +194,7 @@ class OpenGauss(BaseVector):
metadata, text, distance = record
score = 1 - distance
metadata["score"] = score
if score > score_threshold:
if score >= score_threshold:
docs.append(Document(page_content=text, metadata=metadata))
return docs

View File

@@ -211,7 +211,7 @@ class OpenSearchVector(BaseVector):
metadata["score"] = hit["_score"]
score_threshold = float(kwargs.get("score_threshold") or 0.0)
if hit["_score"] > score_threshold:
if hit["_score"] >= score_threshold:
doc = Document(page_content=hit["_source"].get(Field.CONTENT_KEY.value), metadata=metadata)
docs.append(doc)

View File

@@ -261,7 +261,7 @@ class OracleVector(BaseVector):
metadata, text, distance = record
score = 1 - distance
metadata["score"] = score
if score > score_threshold:
if score >= score_threshold:
docs.append(Document(page_content=text, metadata=metadata))
conn.close()
return docs

View File

@@ -202,7 +202,7 @@ class PGVectoRS(BaseVector):
score = 1 - dis
metadata["score"] = score
score_threshold = float(kwargs.get("score_threshold") or 0.0)
if score > score_threshold:
if score >= score_threshold:
doc = Document(page_content=record.text, metadata=metadata)
docs.append(doc)
return docs

View File

@@ -195,7 +195,7 @@ class PGVector(BaseVector):
metadata, text, distance = record
score = 1 - distance
metadata["score"] = score
if score > score_threshold:
if score >= score_threshold:
docs.append(Document(page_content=text, metadata=metadata))
return docs

View File

@@ -0,0 +1,341 @@
import hashlib
import json
import re
import time
from typing import Any, Optional

from pinecone import Pinecone, ServerlessSpec
from pydantic import BaseModel

from configs import dify_config
from core.rag.datasource.vdb.field import Field
from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
from core.rag.datasource.vdb.vector_type import VectorType
from core.rag.embedding.embedding_base import Embeddings
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DatasetCollectionBinding


class PineconeConfig(BaseModel):
    """Pinecone configuration class"""

    api_key: str
    environment: str
    index_name: Optional[str] = None
    timeout: float = 30
    batch_size: int = 100
    metric: str = "cosine"


class PineconeVector(BaseVector):
    """Pinecone vector database concrete implementation class"""

    def __init__(self, collection_name: str, group_id: str, config: PineconeConfig):
        super().__init__(collection_name)
        self._client_config = config
        self._group_id = group_id
        # Initialize Pinecone client with SSL configuration
        try:
            self._pc = Pinecone(
                api_key=config.api_key,
                # Configure SSL to handle connection issues
                ssl_ca_certs=None,  # Use system default CA certificates
            )
        except Exception as e:
            # Fallback to basic initialization if SSL config fails
            self._pc = Pinecone(api_key=config.api_key)
        # Normalize index name: lowercase, only a-z0-9- and <=45 chars
        base_name = collection_name.lower()
        base_name = re.sub(r'[^a-z0-9-]+', '-', base_name)  # replace invalid chars with '-'
        base_name = re.sub(r'-+', '-', base_name).strip('-')
        # Use longer secure suffix to reduce collision risk
        suffix_len = 24  # 24 hex digits (96-bit entropy)
        if len(base_name) > 45:
            hash_suffix = hashlib.sha256(base_name.encode()).hexdigest()[:suffix_len]
            truncated_name = base_name[:45 - (suffix_len + 1)].rstrip('-')
            self._index_name = f"{truncated_name}-{hash_suffix}"
        else:
            self._index_name = base_name
        # Guard empty name
        if not self._index_name:
            self._index_name = f"index-{hashlib.sha256(collection_name.encode()).hexdigest()[:suffix_len]}"
        self._index = None

    def get_type(self) -> str:
        """Return vector database type identifier"""
        return "pinecone"

    def _ensure_index_initialized(self) -> None:
        """Ensure that self._index is attached to an existing Pinecone index."""
        if self._index is not None:
            return
        try:
            existing_indexes = self._pc.list_indexes().names()
            if self._index_name in existing_indexes:
                self._index = self._pc.Index(self._index_name)
            else:
                raise ValueError("Index not initialized. Please ingest documents to create index.")
        except Exception:
            raise

    def to_index_struct(self) -> dict:
        """Generate index structure dictionary"""
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self._collection_name},
        }

    def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
        """Create vector index"""
        if texts:
            # Get vector dimension
            vector_size = len(embeddings[0])
            # Create Pinecone index
            self.create_index(vector_size)
            # Add vector data
            self.add_texts(texts, embeddings, **kwargs)

    def create_index(self, dimension: int):
        """Create Pinecone index"""
        lock_name = f"vector_indexing_lock_{self._index_name}"
        with redis_client.lock(lock_name, timeout=30):
            # Check Redis cache
            index_exist_cache_key = f"vector_indexing_{self._index_name}"
            if redis_client.get(index_exist_cache_key):
                self._index = self._pc.Index(self._index_name)
                return
            # Check if index already exists
            existing_indexes = self._pc.list_indexes().names()
            if self._index_name not in existing_indexes:
                # Create new index using ServerlessSpec
                self._pc.create_index(
                    name=self._index_name,
                    dimension=dimension,
                    metric=self._client_config.metric,
                    spec=ServerlessSpec(
                        cloud='aws',
                        region=self._client_config.environment,
                    ),
                )
                # Wait for index creation to complete
                while not self._pc.describe_index(self._index_name).status['ready']:
                    time.sleep(1)
            # Get index instance (newly created or pre-existing)
            self._index = self._pc.Index(self._index_name)
            # Set cache
            redis_client.set(index_exist_cache_key, 1, ex=3600)

    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
        """Batch add document vectors"""
        if not self._index:
            raise ValueError("Index not initialized. Call create() first.")
        total_docs = len(documents)
        uuids = self._get_uuids(documents)
        batch_size = self._client_config.batch_size
        added_ids = []
        # Batch processing
        total_batches = (total_docs + batch_size - 1) // batch_size  # Ceiling division
        for batch_idx, i in enumerate(range(0, len(documents), batch_size), 1):
            batch_documents = documents[i:i + batch_size]
            batch_embeddings = embeddings[i:i + batch_size]
            batch_uuids = uuids[i:i + batch_size]
            batch_size_actual = len(batch_documents)
            # Build Pinecone vector data (metadata must be primitives or list[str])
            vectors_to_upsert = []
            for doc, embedding, doc_id in zip(batch_documents, batch_embeddings, batch_uuids):
                raw_meta = doc.metadata or {}
                safe_meta: dict[str, Any] = {}
                # lift common identifiers to top-level fields for filtering
                for k, v in raw_meta.items():
                    if isinstance(v, (str, int, float, bool)):
                        safe_meta[k] = v
                    elif isinstance(v, list) and all(isinstance(x, str) for x in v):
                        safe_meta[k] = v
                    else:
                        safe_meta[k] = json.dumps(v, ensure_ascii=False)
                # keep content as string metadata if needed
                safe_meta[Field.CONTENT_KEY.value] = doc.page_content
                # group id as string
                safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
                vectors_to_upsert.append({
                    "id": doc_id,
                    "values": embedding,
                    "metadata": safe_meta,
                })
            # Batch insert to Pinecone
            try:
                self._index.upsert(vectors=vectors_to_upsert)
                added_ids.extend(batch_uuids)
            except Exception as e:
                raise
        return added_ids

    def search_by_vector(self, query_vector: list[float], **kwargs) -> list[Document]:
        """Vector similarity search"""
        # Lazily attach to an existing index if needed
        self._ensure_index_initialized()
        top_k = kwargs.get("top_k", 4)
        score_threshold = float(kwargs.get("score_threshold", 0.0))
        # Build filter conditions
        filter_dict = {Field.GROUP_KEY.value: {"$eq": str(self._group_id)}}
        # Document scope filtering
        document_ids_filter = kwargs.get("document_ids_filter")
        if document_ids_filter:
            filter_dict["document_id"] = {"$in": document_ids_filter}
        # Execute search
        try:
            response = self._index.query(
                vector=query_vector,
                top_k=top_k,
                include_metadata=True,
                filter=filter_dict,
            )
        except Exception as e:
            raise
        # Convert results
        docs = []
        filtered_count = 0
        for match in response.matches:
            if match.score >= score_threshold:
                page_content = match.metadata.get(Field.CONTENT_KEY.value, "")
                metadata = dict(match.metadata or {})
                metadata.pop(Field.CONTENT_KEY.value, None)
                metadata.pop(Field.GROUP_KEY.value, None)
                metadata["score"] = match.score
                doc = Document(page_content=page_content, metadata=metadata)
                docs.append(doc)
            else:
                filtered_count += 1
        # Sort by similarity score in descending order
        docs.sort(key=lambda x: x.metadata.get("score", 0), reverse=True)
        return docs

    def search_by_full_text(self, query: str, **kwargs) -> list[Document]:
        """Full-text search - Pinecone does not natively support it, returns empty list"""
        return []

    def delete_by_metadata_field(self, key: str, value: str):
        """Delete by metadata field"""
        self._ensure_index_initialized()
        try:
            # Build filter conditions
            filter_dict = {
                Field.GROUP_KEY.value: {"$eq": self._group_id},
                f"{Field.METADATA_KEY.value}.{key}": {"$eq": value},
            }
            # Pinecone delete operation
            self._index.delete(filter=filter_dict)
        except Exception as e:
            # Ignore delete errors
            pass

    def delete_by_ids(self, ids: list[str]) -> None:
        """Batch delete by ID list"""
        self._ensure_index_initialized()
        try:
            # Pinecone delete by ID
            self._index.delete(ids=ids)
        except Exception as e:
            raise

    def delete(self) -> None:
        """Delete all vector data for the entire dataset"""
        self._ensure_index_initialized()
        try:
            # Delete all vectors by group_id
            filter_dict = {Field.GROUP_KEY.value: {"$eq": self._group_id}}
            self._index.delete(filter=filter_dict)
        except Exception as e:
            raise

    def text_exists(self, id: str) -> bool:
        """Check if document exists"""
        try:
            self._ensure_index_initialized()
        except Exception:
            return False
        try:
            # Check if vector exists through query
            response = self._index.fetch(ids=[id])
            exists = id in response.vectors
            return exists
        except Exception as e:
            return False


class PineconeVectorFactory(AbstractVectorFactory):
    """Pinecone vector database factory class"""

    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> PineconeVector:
        """Create PineconeVector instance"""
        # Determine index name
        if dataset.collection_binding_id:
            dataset_collection_binding = (
                db.session.query(DatasetCollectionBinding)
                .where(DatasetCollectionBinding.id == dataset.collection_binding_id)
                .one_or_none()
            )
            if dataset_collection_binding:
                collection_name = dataset_collection_binding.collection_name
            else:
                raise ValueError("Dataset Collection Bindings does not exist!")
        else:
            if dataset.index_struct_dict:
                class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
                collection_name = class_prefix
            else:
                dataset_id = dataset.id
                collection_name = Dataset.gen_collection_name_by_id(dataset_id)
        # Set index structure
        if not dataset.index_struct_dict:
            dataset.index_struct = json.dumps(
                self.gen_index_struct_dict("pinecone", collection_name)
            )
        # Create PineconeVector instance
        return PineconeVector(
            collection_name=collection_name,
            group_id=dataset.id,
            config=PineconeConfig(
                api_key=dify_config.PINECONE_API_KEY or "",
                environment=dify_config.PINECONE_ENVIRONMENT or "",
                index_name=dify_config.PINECONE_INDEX_NAME,
                timeout=dify_config.PINECONE_CLIENT_TIMEOUT,
                batch_size=dify_config.PINECONE_BATCH_SIZE,
                metric=dify_config.PINECONE_METRIC,
            ),
        )
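
To illustrate the index-name normalization performed in __init__ above, a short standalone sketch (the collection name is hypothetical):

import hashlib
import re

collection_name = "Vector_index_3c90c3cc_0d44_4b50_8888_8dd25736052a_Node"  # hypothetical
base_name = re.sub(r'-+', '-', re.sub(r'[^a-z0-9-]+', '-', collection_name.lower())).strip('-')

if len(base_name) > 45:
    suffix = hashlib.sha256(base_name.encode()).hexdigest()[:24]  # 96-bit SHA-256 suffix
    base_name = f"{base_name[:45 - 25].rstrip('-')}-{suffix}"

print(base_name)       # 20-char truncated prefix + '-' + 24 hex chars
print(len(base_name))  # 45, within Pinecone's index-name limit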

View File

@@ -170,7 +170,7 @@ class VastbaseVector(BaseVector):
metadata, text, distance = record
score = 1 - distance
metadata["score"] = score
if score > score_threshold:
if score >= score_threshold:
docs.append(Document(page_content=text, metadata=metadata))
return docs

View File

@@ -369,7 +369,7 @@ class QdrantVector(BaseVector):
continue
metadata = result.payload.get(Field.METADATA_KEY.value) or {}
# duplicate check score threshold
if result.score > score_threshold:
if result.score >= score_threshold:
metadata["score"] = result.score
doc = Document(
page_content=result.payload.get(Field.CONTENT_KEY.value, ""),

View File

@@ -233,7 +233,7 @@ class RelytVector(BaseVector):
docs = []
for document, score in results:
score_threshold = float(kwargs.get("score_threshold") or 0.0)
if 1 - score > score_threshold:
if 1 - score >= score_threshold:
docs.append(document)
return docs

View File

@@ -300,7 +300,7 @@ class TableStoreVector(BaseVector):
)
documents = []
for search_hit in search_response.search_hits:
if search_hit.score > score_threshold:
if search_hit.score >= score_threshold:
ots_column_map = {}
for col in search_hit.row[1]:
ots_column_map[col[0]] = col[1]

View File

@@ -291,7 +291,7 @@ class TencentVector(BaseVector):
score = 1 - result.get("score", 0.0)
else:
score = result.get("score", 0.0)
if score > score_threshold:
if score >= score_threshold:
meta["score"] = score
doc = Document(page_content=result.get(self.field_text), metadata=meta)
docs.append(doc)

View File

@@ -351,7 +351,7 @@ class TidbOnQdrantVector(BaseVector):
metadata = result.payload.get(Field.METADATA_KEY.value) or {}
# duplicate check score threshold
score_threshold = kwargs.get("score_threshold") or 0.0
if result.score > score_threshold:
if result.score >= score_threshold:
metadata["score"] = result.score
doc = Document(
page_content=result.payload.get(Field.CONTENT_KEY.value, ""),

View File

@@ -110,7 +110,7 @@ class UpstashVector(BaseVector):
score = record.score
if metadata is not None and text is not None:
metadata["score"] = score
if score > score_threshold:
if score >= score_threshold:
docs.append(Document(page_content=text, metadata=metadata))
return docs

View File

@@ -86,6 +86,10 @@ class Vector:
from core.rag.datasource.vdb.pgvecto_rs.pgvecto_rs import PGVectoRSFactory
return PGVectoRSFactory
case VectorType.PINECONE:
from core.rag.datasource.vdb.pinecone.pinecone_vector import PineconeVectorFactory
return PineconeVectorFactory
case VectorType.QDRANT:
from core.rag.datasource.vdb.qdrant.qdrant_vector import QdrantVectorFactory

View File

@@ -31,3 +31,4 @@ class VectorType(StrEnum):
HUAWEI_CLOUD = "huawei_cloud"
MATRIXONE = "matrixone"
CLICKZETTA = "clickzetta"
PINECONE = "pinecone"

View File

@@ -192,7 +192,7 @@ class VikingDBVector(BaseVector):
metadata = result.fields.get(vdb_Field.METADATA_KEY.value)
if metadata is not None:
metadata = json.loads(metadata)
if result.score > score_threshold:
if result.score >= score_threshold:
metadata["score"] = result.score
doc = Document(page_content=result.fields.get(vdb_Field.CONTENT_KEY.value), metadata=metadata)
docs.append(doc)

View File

@@ -220,7 +220,7 @@ class WeaviateVector(BaseVector):
for doc, score in docs_and_scores:
score_threshold = float(kwargs.get("score_threshold") or 0.0)
# check score threshold
if score > score_threshold:
if score >= score_threshold:
if doc.metadata is not None:
doc.metadata["score"] = score
docs.append(doc)

View File

@@ -10,6 +10,23 @@ from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
def _format_cell_value(value) -> str:
    if pd.isna(value):
        return ""
    if isinstance(value, (int, float)):
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            else:
                formatted = f"{value:f}"
                return formatted.rstrip('0').rstrip('.')
        else:
            return str(value)
    return str(value)
class ExcelExtractor(BaseExtractor):
"""Load Excel files.
@@ -49,10 +66,12 @@ class ExcelExtractor(BaseExtractor):
row=cast(int, index) + 2, column=col_index + 1
) # +2 to account for header and 1-based index
if cell.hyperlink:
value = f"[{v}]({cell.hyperlink.target})"
formatted_v = _format_cell_value(v)
value = f"[{formatted_v}]({cell.hyperlink.target})"
page_content.append(f'"{k}":"{value}"')
else:
page_content.append(f'"{k}":"{v}"')
formatted_v = _format_cell_value(v)
page_content.append(f'"{k}":"{formatted_v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
@@ -67,7 +86,8 @@ class ExcelExtractor(BaseExtractor):
page_content = []
for k, v in row.items():
if pd.notna(v):
page_content.append(f'"{k}":"{v}"')
formatted_v = _format_cell_value(v)
page_content.append(f'"{k}":"{formatted_v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
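
The effect of the new _format_cell_value helper, shown with a few illustrative inputs:

print(_format_cell_value(3.0))      # "3"        -- integral floats lose the trailing ".0"
print(_format_cell_value(0.00005))  # "0.00005"  -- plain str() would give "5e-05"
print(_format_cell_value(42))       # "42"
print(_format_cell_value("text"))   # "text"     -- non-numeric values pass through str()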

View File

@@ -123,7 +123,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
for result in results:
metadata = result.metadata
metadata["score"] = result.score
if result.score > score_threshold:
if result.score >= score_threshold:
doc = Document(page_content=result.page_content, metadata=metadata)
docs.append(doc)
return docs

View File

@@ -162,7 +162,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
for result in results:
metadata = result.metadata
metadata["score"] = result.score
if result.score > score_threshold:
if result.score >= score_threshold:
doc = Document(page_content=result.page_content, metadata=metadata)
docs.append(doc)
return docs

View File

@@ -158,7 +158,7 @@ class QAIndexProcessor(BaseIndexProcessor):
for result in results:
metadata = result.metadata
metadata["score"] = result.score
if result.score > score_threshold:
if result.score >= score_threshold:
doc = Document(page_content=result.page_content, metadata=metadata)
docs.append(doc)
return docs

View File

@@ -65,7 +65,7 @@ default_retrieval_model: dict[str, Any] = {
"search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
"reranking_enable": False,
"reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
"top_k": 2,
"top_k": 4,
"score_threshold_enabled": False,
}
@@ -647,7 +647,7 @@ class DatasetRetrieval:
retrieval_method=retrieval_model["search_method"],
dataset_id=dataset.id,
query=query,
top_k=retrieval_model.get("top_k") or 2,
top_k=retrieval_model.get("top_k") or 4,
score_threshold=retrieval_model.get("score_threshold", 0.0)
if retrieval_model["score_threshold_enabled"]
else 0.0,
@@ -743,7 +743,7 @@ class DatasetRetrieval:
tool = DatasetMultiRetrieverTool.from_dataset(
dataset_ids=[dataset.id for dataset in available_datasets],
tenant_id=tenant_id,
top_k=retrieve_config.top_k or 2,
top_k=retrieve_config.top_k or 4,
score_threshold=retrieve_config.score_threshold,
hit_callbacks=[hit_callback],
return_resource=return_resource,
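
A side note on the `or 4` fallback used here: `retrieval_model.get("top_k") or 4` falls back whenever the stored value is missing or falsy, which is not the same as passing a default to get(). Illustrative only:

retrieval_model = {"top_k": 0}            # hypothetical stored value

print(retrieval_model.get("top_k") or 4)  # 4 -- falsy 0 triggers the fallback
print(retrieval_model.get("top_k", 4))    # 0 -- a plain default keeps the stored 0
print({}.get("top_k") or 4)               # 4 -- missing key also falls back to the new default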

View File

@@ -181,7 +181,7 @@ class DatasetMultiRetrieverTool(DatasetRetrieverBaseTool):
retrieval_method="keyword_search",
dataset_id=dataset.id,
query=query,
top_k=retrieval_model.get("top_k") or 2,
top_k=retrieval_model.get("top_k") or 4,
)
if documents:
all_documents.extend(documents)
@@ -192,7 +192,7 @@ class DatasetMultiRetrieverTool(DatasetRetrieverBaseTool):
retrieval_method=retrieval_model["search_method"],
dataset_id=dataset.id,
query=query,
top_k=retrieval_model.get("top_k") or 2,
top_k=retrieval_model.get("top_k") or 4,
score_threshold=retrieval_model.get("score_threshold", 0.0)
if retrieval_model["score_threshold_enabled"]
else 0.0,

View File

@@ -13,7 +13,7 @@ class DatasetRetrieverBaseTool(BaseModel, ABC):
name: str = "dataset"
description: str = "use this to retrieve a dataset. "
tenant_id: str
top_k: int = 2
top_k: int = 4
score_threshold: Optional[float] = None
hit_callbacks: list[DatasetIndexToolCallbackHandler] = []
return_resource: bool

View File

@@ -485,6 +485,24 @@ def _extract_text_from_csv(file_content: bytes) -> str:
raise TextExtractionError(f"Failed to extract text from CSV: {str(e)}") from e
def _format_cell_value_for_markdown(value) -> str:
    """Format a cell value, avoiding scientific notation."""
    if pd.isna(value):
        return ""
    if isinstance(value, (int, float)):
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            else:
                formatted = f"{value:f}"
                return formatted.rstrip('0').rstrip('.')
        else:
            return str(value)
    return str(value)
def _extract_text_from_excel(file_content: bytes) -> str:
"""Extract text from an Excel file using pandas."""
@@ -499,7 +517,8 @@ def _extract_text_from_excel(file_content: bytes) -> str:
# Construct the data rows
data_rows = []
for _, row in df.iterrows():
data_row = "| " + " | ".join(map(str, row)) + " |"
formatted_row = [_format_cell_value_for_markdown(cell) for cell in row]
data_row = "| " + " | ".join(formatted_row) + " |"
data_rows.append(data_row)
# Combine all rows into a single string

View File

@@ -78,7 +78,7 @@ default_retrieval_model = {
"search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
"reranking_enable": False,
"reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
"top_k": 2,
"top_k": 4,
"score_threshold_enabled": False,
}

View File

@@ -88,6 +88,7 @@ dependencies = [
"httpx-sse>=0.4.0",
"sendgrid~=6.12.3",
"flask-restx>=1.3.0",
"pinecone>=7.3.0",
]
# Before adding new dependency, consider place it in
# alphabet order (a-z) and suitable group.

View File

@@ -1149,7 +1149,7 @@ class DocumentService:
"search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
"reranking_enable": False,
"reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
"top_k": 2,
"top_k": 4,
"score_threshold_enabled": False,
}
@@ -1612,7 +1612,7 @@ class DocumentService:
search_method=RetrievalMethod.SEMANTIC_SEARCH.value,
reranking_enable=False,
reranking_model=RerankingModel(reranking_provider_name="", reranking_model_name=""),
top_k=2,
top_k=4,
score_threshold_enabled=False,
)
# save dataset

View File

@@ -18,7 +18,7 @@ default_retrieval_model = {
"search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
"reranking_enable": False,
"reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
"top_k": 2,
"top_k": 4,
"score_threshold_enabled": False,
}
@@ -66,7 +66,7 @@ class HitTestingService:
retrieval_method=retrieval_model.get("search_method", "semantic_search"),
dataset_id=dataset.id,
query=query,
top_k=retrieval_model.get("top_k", 2),
top_k=retrieval_model.get("top_k", 4),
score_threshold=retrieval_model.get("score_threshold", 0.0)
if retrieval_model["score_threshold_enabled"]
else 0.0,

View File

@@ -0,0 +1,30 @@
from core.rag.datasource.vdb.pinecone.pinecone_vector import PineconeConfig, PineconeVector
from core.rag.models.document import Document
from tests.integration_tests.vdb.test_vector_store import (
AbstractVectorTest,
setup_mock_redis,
)
class PineconeVectorTest(AbstractVectorTest):
    def __init__(self):
        super().__init__()
        self.attributes = ["doc_id", "dataset_id", "document_id", "doc_hash"]
        self.vector = PineconeVector(
            collection_name=self.collection_name,
            group_id=self.dataset_id,
            config=PineconeConfig(
                api_key="test_api_key",
                environment="test_environment",
                index_name="test_index",
            ),
        )

    def search_by_vector(self):
        super().search_by_vector()


def test_pinecone_vector(setup_mock_redis):
    PineconeVectorTest().run_all_tests()

api/uv.lock (generated, 4849 changed lines)

File diff suppressed because it is too large

View File

@@ -20,7 +20,17 @@ services:
ports:
- "${EXPOSE_POSTGRES_PORT:-5432}:5432"
healthcheck:
test: [ 'CMD', 'pg_isready', '-h', 'db', '-U', '${PGUSER:-postgres}', '-d', '${POSTGRES_DB:-dify}' ]
test:
[
"CMD",
"pg_isready",
"-h",
"db",
"-U",
"${PGUSER:-postgres}",
"-d",
"${POSTGRES_DB:-dify}",
]
interval: 1s
timeout: 3s
retries: 30
@@ -41,7 +51,11 @@ services:
ports:
- "${EXPOSE_REDIS_PORT:-6379}:6379"
healthcheck:
test: [ 'CMD-SHELL', 'redis-cli -a ${REDIS_PASSWORD:-difyai123456} ping | grep -q PONG' ]
test:
[
"CMD-SHELL",
"redis-cli -a ${REDIS_PASSWORD:-difyai123456} ping | grep -q PONG",
]
# The DifySandbox
sandbox:
@@ -65,13 +79,13 @@ services:
- ./volumes/sandbox/dependencies:/dependencies
- ./volumes/sandbox/conf:/conf
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:8194/health" ]
test: ["CMD", "curl", "-f", "http://localhost:8194/health"]
networks:
- ssrf_proxy_network
# plugin daemon
plugin_daemon:
image: langgenius/dify-plugin-daemon:0.2.0-local
image: langgenius/dify-plugin-daemon:0.3.0-local
restart: always
env_file:
- ./middleware.env
@@ -94,7 +108,6 @@ services:
PLUGIN_REMOTE_INSTALLING_HOST: ${PLUGIN_DEBUGGING_HOST:-0.0.0.0}
PLUGIN_REMOTE_INSTALLING_PORT: ${PLUGIN_DEBUGGING_PORT:-5003}
PLUGIN_WORKING_PATH: ${PLUGIN_WORKING_PATH:-/app/storage/cwd}
FORCE_VERIFYING_SIGNATURE: ${FORCE_VERIFYING_SIGNATURE:-true}
PYTHON_ENV_INIT_TIMEOUT: ${PLUGIN_PYTHON_ENV_INIT_TIMEOUT:-120}
PLUGIN_MAX_EXECUTION_TIMEOUT: ${PLUGIN_MAX_EXECUTION_TIMEOUT:-600}
PIP_MIRROR_URL: ${PIP_MIRROR_URL:-}
@@ -126,6 +139,9 @@ services:
VOLCENGINE_TOS_ACCESS_KEY: ${PLUGIN_VOLCENGINE_TOS_ACCESS_KEY:-}
VOLCENGINE_TOS_SECRET_KEY: ${PLUGIN_VOLCENGINE_TOS_SECRET_KEY:-}
VOLCENGINE_TOS_REGION: ${PLUGIN_VOLCENGINE_TOS_REGION:-}
THIRD_PARTY_SIGNATURE_VERIFICATION_ENABLED: true
THIRD_PARTY_SIGNATURE_VERIFICATION_PUBLIC_KEYS: /app/keys/publickey.pem
FORCE_VERIFYING_SIGNATURE: false
ports:
- "${EXPOSE_PLUGIN_DAEMON_PORT:-5002}:${PLUGIN_DAEMON_PORT:-5002}"
- "${EXPOSE_PLUGIN_DEBUGGING_PORT:-5003}:${PLUGIN_DEBUGGING_PORT:-5003}"
@@ -141,7 +157,12 @@ services:
volumes:
- ./ssrf_proxy/squid.conf.template:/etc/squid/squid.conf.template
- ./ssrf_proxy/docker-entrypoint.sh:/docker-entrypoint-mount.sh
entrypoint: [ "sh", "-c", "cp /docker-entrypoint-mount.sh /docker-entrypoint.sh && sed -i 's/\r$$//' /docker-entrypoint.sh && chmod +x /docker-entrypoint.sh && /docker-entrypoint.sh" ]
entrypoint:
[
"sh",
"-c",
"cp /docker-entrypoint-mount.sh /docker-entrypoint.sh && sed -i 's/\r$$//' /docker-entrypoint.sh && chmod +x /docker-entrypoint.sh && /docker-entrypoint.sh",
]
env_file:
- ./middleware.env
environment:

View File

@@ -10,7 +10,7 @@ x-shared-env: &shared-api-worker-env
SERVICE_API_URL: ${SERVICE_API_URL:-}
APP_API_URL: ${APP_API_URL:-}
APP_WEB_URL: ${APP_WEB_URL:-}
FILES_URL: ${FILES_URL:-}
FILES_URL: ${FILES_URL:-http://api:5001}
INTERNAL_FILES_URL: ${INTERNAL_FILES_URL:-}
LANG: ${LANG:-en_US.UTF-8}
LC_ALL: ${LC_ALL:-en_US.UTF-8}
@@ -62,6 +62,7 @@ x-shared-env: &shared-api-worker-env
SQLALCHEMY_ECHO: ${SQLALCHEMY_ECHO:-false}
SQLALCHEMY_POOL_PRE_PING: ${SQLALCHEMY_POOL_PRE_PING:-false}
SQLALCHEMY_POOL_USE_LIFO: ${SQLALCHEMY_POOL_USE_LIFO:-false}
SQLALCHEMY_POOL_TIMEOUT: ${SQLALCHEMY_POOL_TIMEOUT:-30}
POSTGRES_MAX_CONNECTIONS: ${POSTGRES_MAX_CONNECTIONS:-100}
POSTGRES_SHARED_BUFFERS: ${POSTGRES_SHARED_BUFFERS:-128MB}
POSTGRES_WORK_MEM: ${POSTGRES_WORK_MEM:-4MB}
@@ -285,6 +286,8 @@ x-shared-env: &shared-api-worker-env
BAIDU_VECTOR_DB_DATABASE: ${BAIDU_VECTOR_DB_DATABASE:-dify}
BAIDU_VECTOR_DB_SHARD: ${BAIDU_VECTOR_DB_SHARD:-1}
BAIDU_VECTOR_DB_REPLICAS: ${BAIDU_VECTOR_DB_REPLICAS:-3}
BAIDU_VECTOR_DB_INVERTED_INDEX_ANALYZER: ${BAIDU_VECTOR_DB_INVERTED_INDEX_ANALYZER:-DEFAULT_ANALYZER}
BAIDU_VECTOR_DB_INVERTED_INDEX_PARSER_MODE: ${BAIDU_VECTOR_DB_INVERTED_INDEX_PARSER_MODE:-COARSE_MODE}
VIKINGDB_ACCESS_KEY: ${VIKINGDB_ACCESS_KEY:-your-ak}
VIKINGDB_SECRET_KEY: ${VIKINGDB_SECRET_KEY:-your-sk}
VIKINGDB_REGION: ${VIKINGDB_REGION:-cn-shanghai}
@@ -292,9 +295,10 @@ x-shared-env: &shared-api-worker-env
VIKINGDB_SCHEMA: ${VIKINGDB_SCHEMA:-http}
VIKINGDB_CONNECTION_TIMEOUT: ${VIKINGDB_CONNECTION_TIMEOUT:-30}
VIKINGDB_SOCKET_TIMEOUT: ${VIKINGDB_SOCKET_TIMEOUT:-30}
LINDORM_URL: ${LINDORM_URL:-http://lindorm:30070}
LINDORM_USERNAME: ${LINDORM_USERNAME:-lindorm}
LINDORM_PASSWORD: ${LINDORM_PASSWORD:-lindorm}
LINDORM_URL: ${LINDORM_URL:-http://localhost:30070}
LINDORM_USERNAME: ${LINDORM_USERNAME:-admin}
LINDORM_PASSWORD: ${LINDORM_PASSWORD:-admin}
LINDORM_USING_UGC: ${LINDORM_USING_UGC:-True}
LINDORM_QUERY_TIMEOUT: ${LINDORM_QUERY_TIMEOUT:-1}
OCEANBASE_VECTOR_HOST: ${OCEANBASE_VECTOR_HOST:-oceanbase}
OCEANBASE_VECTOR_PORT: ${OCEANBASE_VECTOR_PORT:-2881}
@@ -304,6 +308,7 @@ x-shared-env: &shared-api-worker-env
OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G}
OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false}
OCEANBASE_FULLTEXT_PARSER: ${OCEANBASE_FULLTEXT_PARSER:-ik}
OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss}
OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600}
OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres}
@@ -372,6 +377,7 @@ x-shared-env: &shared-api-worker-env
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-4000}
INVITE_EXPIRY_HOURS: ${INVITE_EXPIRY_HOURS:-72}
RESET_PASSWORD_TOKEN_EXPIRY_MINUTES: ${RESET_PASSWORD_TOKEN_EXPIRY_MINUTES:-5}
EMAIL_REGISTER_TOKEN_EXPIRY_MINUTES: ${EMAIL_REGISTER_TOKEN_EXPIRY_MINUTES:-5}
CHANGE_EMAIL_TOKEN_EXPIRY_MINUTES: ${CHANGE_EMAIL_TOKEN_EXPIRY_MINUTES:-5}
OWNER_TRANSFER_TOKEN_EXPIRY_MINUTES: ${OWNER_TRANSFER_TOKEN_EXPIRY_MINUTES:-5}
CODE_EXECUTION_ENDPOINT: ${CODE_EXECUTION_ENDPOINT:-http://sandbox:8194}
@@ -394,6 +400,10 @@ x-shared-env: &shared-api-worker-env
MAX_VARIABLE_SIZE: ${MAX_VARIABLE_SIZE:-204800}
WORKFLOW_PARALLEL_DEPTH_LIMIT: ${WORKFLOW_PARALLEL_DEPTH_LIMIT:-3}
WORKFLOW_FILE_UPLOAD_LIMIT: ${WORKFLOW_FILE_UPLOAD_LIMIT:-10}
GRAPH_ENGINE_MIN_WORKERS: ${GRAPH_ENGINE_MIN_WORKERS:-1}
GRAPH_ENGINE_MAX_WORKERS: ${GRAPH_ENGINE_MAX_WORKERS:-10}
GRAPH_ENGINE_SCALE_UP_THRESHOLD: ${GRAPH_ENGINE_SCALE_UP_THRESHOLD:-3}
GRAPH_ENGINE_SCALE_DOWN_IDLE_TIME: ${GRAPH_ENGINE_SCALE_DOWN_IDLE_TIME:-5.0}
WORKFLOW_NODE_EXECUTION_STORAGE: ${WORKFLOW_NODE_EXECUTION_STORAGE:-rdbms}
CORE_WORKFLOW_EXECUTION_REPOSITORY: ${CORE_WORKFLOW_EXECUTION_REPOSITORY:-core.repositories.sqlalchemy_workflow_execution_repository.SQLAlchemyWorkflowExecutionRepository}
CORE_WORKFLOW_NODE_EXECUTION_REPOSITORY: ${CORE_WORKFLOW_NODE_EXECUTION_REPOSITORY:-core.repositories.sqlalchemy_workflow_node_execution_repository.SQLAlchemyWorkflowNodeExecutionRepository}
@@ -570,6 +580,7 @@ x-shared-env: &shared-api-worker-env
QUEUE_MONITOR_INTERVAL: ${QUEUE_MONITOR_INTERVAL:-30}
SWAGGER_UI_ENABLED: ${SWAGGER_UI_ENABLED:-true}
SWAGGER_UI_PATH: ${SWAGGER_UI_PATH:-/swagger-ui.html}
DSL_EXPORT_ENCRYPT_DATASET_ID: ${DSL_EXPORT_ENCRYPT_DATASET_ID:-true}
ENABLE_CLEAN_EMBEDDING_CACHE_TASK: ${ENABLE_CLEAN_EMBEDDING_CACHE_TASK:-false}
ENABLE_CLEAN_UNUSED_DATASETS_TASK: ${ENABLE_CLEAN_UNUSED_DATASETS_TASK:-false}
ENABLE_CREATE_TIDB_SERVERLESS_TASK: ${ENABLE_CREATE_TIDB_SERVERLESS_TASK:-false}
@@ -582,7 +593,7 @@ x-shared-env: &shared-api-worker-env
services:
# API service
api:
image: langgenius/dify-api:1.8.0
image: langgenius/dify-api:1.9.0
restart: always
environment:
# Use the shared environment variables.
@@ -611,7 +622,7 @@ services:
# worker service
# The Celery worker for processing the queue.
worker:
image: langgenius/dify-api:1.8.0
image: langgenius/dify-api:1.9.0
restart: always
environment:
# Use the shared environment variables.
@@ -638,7 +649,7 @@ services:
# worker_beat service
# Celery beat for scheduling periodic tasks.
worker_beat:
image: langgenius/dify-api:1.8.0
image: langgenius/dify-api:1.9.0
restart: always
environment:
# Use the shared environment variables.
@@ -656,7 +667,7 @@ services:
# Frontend web application.
web:
image: langgenius/dify-web:1.8.0
image: langgenius/dify-web:1.9.0
restart: always
environment:
CONSOLE_API_URL: ${CONSOLE_API_URL:-}
@@ -698,7 +709,17 @@ services:
volumes:
- ./volumes/db/data:/var/lib/postgresql/data
healthcheck:
test: [ 'CMD', 'pg_isready', '-h', 'db', '-U', '${PGUSER:-postgres}', '-d', '${POSTGRES_DB:-dify}' ]
test:
[
"CMD",
"pg_isready",
"-h",
"db",
"-U",
"${PGUSER:-postgres}",
"-d",
"${POSTGRES_DB:-dify}",
]
interval: 1s
timeout: 3s
retries: 60
@@ -715,7 +736,11 @@ services:
# Set the redis password when startup redis server.
command: redis-server --requirepass ${REDIS_PASSWORD:-difyai123456}
healthcheck:
test: [ 'CMD-SHELL', 'redis-cli -a ${REDIS_PASSWORD:-difyai123456} ping | grep -q PONG' ]
test:
[
"CMD-SHELL",
"redis-cli -a ${REDIS_PASSWORD:-difyai123456} ping | grep -q PONG",
]
# The DifySandbox
sandbox:
@@ -737,13 +762,13 @@ services:
- ./volumes/sandbox/dependencies:/dependencies
- ./volumes/sandbox/conf:/conf
healthcheck:
test: [ 'CMD', 'curl', '-f', 'http://localhost:8194/health' ]
test: ["CMD", "curl", "-f", "http://localhost:8194/health"]
networks:
- ssrf_proxy_network
# plugin daemon
plugin_daemon:
image: langgenius/dify-plugin-daemon:0.2.0-local
image: langgenius/dify-plugin-daemon:0.3.0-local
restart: always
environment:
# Use the shared environment variables.
@@ -811,7 +836,12 @@ services:
volumes:
- ./ssrf_proxy/squid.conf.template:/etc/squid/squid.conf.template
- ./ssrf_proxy/docker-entrypoint.sh:/docker-entrypoint-mount.sh
entrypoint: [ 'sh', '-c', "cp /docker-entrypoint-mount.sh /docker-entrypoint.sh && sed -i 's/\r$$//' /docker-entrypoint.sh && chmod +x /docker-entrypoint.sh && /docker-entrypoint.sh" ]
entrypoint:
[
"sh",
"-c",
"cp /docker-entrypoint-mount.sh /docker-entrypoint.sh && sed -i 's/\r$$//' /docker-entrypoint.sh && chmod +x /docker-entrypoint.sh && /docker-entrypoint.sh",
]
environment:
# pls clearly modify the squid env vars to fit your network environment.
HTTP_PORT: ${SSRF_HTTP_PORT:-3128}
@@ -840,8 +870,8 @@ services:
- CERTBOT_EMAIL=${CERTBOT_EMAIL}
- CERTBOT_DOMAIN=${CERTBOT_DOMAIN}
- CERTBOT_OPTIONS=${CERTBOT_OPTIONS:-}
entrypoint: [ '/docker-entrypoint.sh' ]
command: [ 'tail', '-f', '/dev/null' ]
entrypoint: ["/docker-entrypoint.sh"]
command: ["tail", "-f", "/dev/null"]
# The nginx reverse proxy.
# used for reverse proxying the API service and Web service.
@@ -858,7 +888,12 @@ services:
- ./volumes/certbot/conf/live:/etc/letsencrypt/live # cert dir (with certbot container)
- ./volumes/certbot/conf:/etc/letsencrypt
- ./volumes/certbot/www:/var/www/html
entrypoint: [ 'sh', '-c', "cp /docker-entrypoint-mount.sh /docker-entrypoint.sh && sed -i 's/\r$$//' /docker-entrypoint.sh && chmod +x /docker-entrypoint.sh && /docker-entrypoint.sh" ]
entrypoint:
[
"sh",
"-c",
"cp /docker-entrypoint-mount.sh /docker-entrypoint.sh && sed -i 's/\r$$//' /docker-entrypoint.sh && chmod +x /docker-entrypoint.sh && /docker-entrypoint.sh",
]
environment:
NGINX_SERVER_NAME: ${NGINX_SERVER_NAME:-_}
NGINX_HTTPS_ENABLED: ${NGINX_HTTPS_ENABLED:-false}
@@ -880,14 +915,14 @@ services:
- api
- web
ports:
- '${EXPOSE_NGINX_PORT:-80}:${NGINX_PORT:-80}'
- '${EXPOSE_NGINX_SSL_PORT:-443}:${NGINX_SSL_PORT:-443}'
- "${EXPOSE_NGINX_PORT:-80}:${NGINX_PORT:-80}"
- "${EXPOSE_NGINX_SSL_PORT:-443}:${NGINX_SSL_PORT:-443}"
# The Weaviate vector store.
weaviate:
image: semitechnologies/weaviate:1.19.0
profiles:
- ''
- ""
- weaviate
restart: always
volumes:
@@ -940,13 +975,17 @@ services:
working_dir: /opt/couchbase
stdin_open: true
tty: true
entrypoint: [ "" ]
entrypoint: [""]
command: sh -c "/opt/couchbase/init/init-cbserver.sh"
volumes:
- ./volumes/couchbase/data:/opt/couchbase/var/lib/couchbase/data
healthcheck:
# ensure bucket was created before proceeding
test: [ "CMD-SHELL", "curl -s -f -u Administrator:password http://localhost:8091/pools/default/buckets | grep -q '\\[{' || exit 1" ]
test:
[
"CMD-SHELL",
"curl -s -f -u Administrator:password http://localhost:8091/pools/default/buckets | grep -q '\\[{' || exit 1",
]
interval: 10s
retries: 10
start_period: 30s
@@ -972,9 +1011,9 @@ services:
volumes:
- ./volumes/pgvector/data:/var/lib/postgresql/data
- ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh
entrypoint: [ '/docker-entrypoint.sh' ]
entrypoint: ["/docker-entrypoint.sh"]
healthcheck:
test: [ 'CMD', 'pg_isready' ]
test: ["CMD", "pg_isready"]
interval: 1s
timeout: 3s
retries: 30
@@ -991,14 +1030,14 @@ services:
- VB_USERNAME=dify
- VB_PASSWORD=Difyai123456
ports:
- '5434:5432'
- "5434:5432"
volumes:
- ./vastbase/lic:/home/vastbase/vastbase/lic
- ./vastbase/data:/home/vastbase/data
- ./vastbase/backup:/home/vastbase/backup
- ./vastbase/backup_log:/home/vastbase/backup_log
healthcheck:
test: [ 'CMD', 'pg_isready' ]
test: ["CMD", "pg_isready"]
interval: 1s
timeout: 3s
retries: 30
@@ -1020,7 +1059,7 @@ services:
volumes:
- ./volumes/pgvecto_rs/data:/var/lib/postgresql/data
healthcheck:
test: [ 'CMD', 'pg_isready' ]
test: ["CMD", "pg_isready"]
interval: 1s
timeout: 3s
retries: 30
@@ -1056,10 +1095,15 @@ services:
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
OB_SERVER_IP: 127.0.0.1
MODE: mini
LANG: en_US.UTF-8
ports:
- "${OCEANBASE_VECTOR_PORT:-2881}:2881"
healthcheck:
test: [ 'CMD-SHELL', 'obclient -h127.0.0.1 -P2881 -uroot@test -p$${OB_TENANT_PASSWORD} -e "SELECT 1;"' ]
test:
[
"CMD-SHELL",
'obclient -h127.0.0.1 -P2881 -uroot@test -p$${OB_TENANT_PASSWORD} -e "SELECT 1;"',
]
interval: 10s
retries: 30
start_period: 30s
@@ -1095,7 +1139,7 @@ services:
- ./volumes/milvus/etcd:/etcd
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
healthcheck:
test: [ 'CMD', 'etcdctl', 'endpoint', 'health' ]
test: ["CMD", "etcdctl", "endpoint", "health"]
interval: 30s
timeout: 20s
retries: 3
@@ -1114,7 +1158,7 @@ services:
- ./volumes/milvus/minio:/minio_data
command: minio server /minio_data --console-address ":9001"
healthcheck:
test: [ 'CMD', 'curl', '-f', 'http://localhost:9000/minio/health/live' ]
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
@@ -1126,7 +1170,7 @@ services:
image: milvusdb/milvus:v2.5.15
profiles:
- milvus
command: [ 'milvus', 'run', 'standalone' ]
command: ["milvus", "run", "standalone"]
environment:
ETCD_ENDPOINTS: ${ETCD_ENDPOINTS:-etcd:2379}
MINIO_ADDRESS: ${MINIO_ADDRESS:-minio:9000}
@@ -1134,7 +1178,7 @@ services:
volumes:
- ./volumes/milvus/milvus:/var/lib/milvus
healthcheck:
test: [ 'CMD', 'curl', '-f', 'http://localhost:9091/healthz' ]
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
interval: 30s
start_period: 90s
timeout: 20s
@@ -1200,7 +1244,7 @@ services:
volumes:
- ./volumes/opengauss/data:/var/lib/opengauss/data
healthcheck:
test: [ "CMD-SHELL", "netstat -lntp | grep tcp6 > /dev/null 2>&1" ]
test: ["CMD-SHELL", "netstat -lntp | grep tcp6 > /dev/null 2>&1"]
interval: 10s
timeout: 10s
retries: 10
@@ -1253,18 +1297,19 @@ services:
node.name: dify-es0
discovery.type: single-node
xpack.license.self_generated.type: basic
xpack.security.enabled: 'true'
xpack.security.enrollment.enabled: 'false'
xpack.security.http.ssl.enabled: 'false'
xpack.security.enabled: "true"
xpack.security.enrollment.enabled: "false"
xpack.security.http.ssl.enabled: "false"
ports:
- ${ELASTICSEARCH_PORT:-9200}:9200
deploy:
resources:
limits:
memory: 2g
entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ]
entrypoint: ["sh", "-c", "sh /docker-entrypoint-mount.sh"]
healthcheck:
test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
test:
["CMD", "curl", "-s", "http://localhost:9200/_cluster/health?pretty"]
interval: 30s
timeout: 10s
retries: 50
@@ -1282,17 +1327,17 @@ services:
environment:
XPACK_ENCRYPTEDSAVEDOBJECTS_ENCRYPTIONKEY: d1a66dfd-c4d3-4a0a-8290-2abcb83ab3aa
NO_PROXY: localhost,127.0.0.1,elasticsearch,kibana
XPACK_SECURITY_ENABLED: 'true'
XPACK_SECURITY_ENROLLMENT_ENABLED: 'false'
XPACK_SECURITY_HTTP_SSL_ENABLED: 'false'
XPACK_FLEET_ISAIRGAPPED: 'true'
XPACK_SECURITY_ENABLED: "true"
XPACK_SECURITY_ENROLLMENT_ENABLED: "false"
XPACK_SECURITY_HTTP_SSL_ENABLED: "false"
XPACK_FLEET_ISAIRGAPPED: "true"
I18N_LOCALE: zh-CN
SERVER_PORT: '5601'
SERVER_PORT: "5601"
ELASTICSEARCH_HOSTS: http://elasticsearch:9200
ports:
- ${KIBANA_PORT:-5601}:5601
healthcheck:
test: [ 'CMD-SHELL', 'curl -s http://localhost:5601 >/dev/null || exit 1' ]
test: ["CMD-SHELL", "curl -s http://localhost:5601 >/dev/null || exit 1"]
interval: 30s
timeout: 10s
retries: 3

View File

@@ -79,6 +79,17 @@ WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED=true
WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai
WEAVIATE_HOST_VOLUME=./volumes/weaviate
# ------------------------------
# Environment Variables for Pinecone Vector Database
# ------------------------------
# Get your API key from: https://app.pinecone.io/
# PINECONE_API_KEY=your-pinecone-api-key
# PINECONE_ENVIRONMENT=us-west1-gcp
# PINECONE_INDEX_NAME=dify-pinecone-index
# PINECONE_CLIENT_TIMEOUT=30
# PINECONE_BATCH_SIZE=100
# PINECONE_METRIC=cosine
# ------------------------------
# Docker Compose Service Expose Host Port Configurations
# ------------------------------

View File

@@ -28,7 +28,7 @@ const ExternalKnowledgeBaseCreate: React.FC<ExternalKnowledgeBaseCreateProps> =
external_knowledge_api_id: '',
external_knowledge_id: '',
external_retrieval_model: {
top_k: 2,
top_k: 4,
score_threshold: 0.5,
score_threshold_enabled: false,
},

View File

@@ -49,7 +49,7 @@ const TextAreaWithButton = ({
const { t } = useTranslation()
const [isSettingsOpen, setIsSettingsOpen] = useState(false)
const [externalRetrievalSettings, setExternalRetrievalSettings] = useState({
top_k: 2,
top_k: 4,
score_threshold: 0.5,
score_threshold_enabled: false,
})

View File

@@ -233,7 +233,7 @@ const DebugConfigurationContext = createContext<IDebugConfiguration>({
reranking_provider_name: '',
reranking_model_name: '',
},
top_k: 2,
top_k: 4,
score_threshold_enabled: false,
score_threshold: 0.7,
datasets: {