Compare commits

...

1 Commits

Author SHA1 Message Date
-LAN-
338fbd9017 fix: update keyword extraction to remove optional parameter and improve type casting
Signed-off-by: -LAN- <laipz8200@outlook.com>
2024-12-26 19:47:49 +08:00
3 changed files with 11 additions and 9 deletions

View File

@@ -1,5 +1,5 @@
import re
- from typing import Optional
+ from typing import cast
class JiebaKeywordTableHandler:
@@ -8,18 +8,20 @@ class JiebaKeywordTableHandler:
from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS
- jieba.analyse.default_tfidf.stop_words = STOPWORDS
+ jieba.analyse.default_tfidf.stop_words = STOPWORDS # type: ignore
- def extract_keywords(self, text: str, max_keywords_per_chunk: Optional[int] = 10) -> set[str]:
+ def extract_keywords(self, text: str, max_keywords_per_chunk: int = 10) -> set[str]:
"""Extract keywords with JIEBA tfidf."""
import jieba # type: ignore
import jieba.analyse # type: ignore
keywords = jieba.analyse.extract_tags(
sentence=text,
topK=max_keywords_per_chunk,
)
+ # jieba.analyse.extract_tags returns list[Any] when withFlag is False by default.
+ keywords = cast(list[str], keywords)
- return set(self._expand_tokens_with_subtokens(keywords))
+ return set(self._expand_tokens_with_subtokens(set(keywords)))
def _expand_tokens_with_subtokens(self, tokens: set[str]) -> set[str]:
"""Get subtokens from a list of tokens., filtering for stopwords."""

View File

@@ -72,11 +72,11 @@ class WeightRerankRunner(BaseRerankRunner):
:return:
"""
keyword_table_handler = JiebaKeywordTableHandler()
- query_keywords = keyword_table_handler.extract_keywords(query, None)
+ query_keywords = keyword_table_handler.extract_keywords(query)
documents_keywords = []
for document in documents:
# get the document keywords
- document_keywords = keyword_table_handler.extract_keywords(document.page_content, None)
+ document_keywords = keyword_table_handler.extract_keywords(document.page_content)
if document.metadata is not None:
document.metadata["keywords"] = document_keywords
documents_keywords.append(document_keywords)

View File

@@ -625,12 +625,12 @@ class DatasetRetrieval:
:return:
"""
keyword_table_handler = JiebaKeywordTableHandler()
- query_keywords = keyword_table_handler.extract_keywords(query, None)
+ query_keywords = keyword_table_handler.extract_keywords(query)
documents_keywords = []
for document in documents:
if document.metadata is not None:
# get the document keywords
- document_keywords = keyword_table_handler.extract_keywords(document.page_content, None)
+ document_keywords = keyword_table_handler.extract_keywords(document.page_content)
document.metadata["keywords"] = document_keywords
documents_keywords.append(document_keywords)