mirror of
https://gitee.com/infiniflow/ragflow.git
synced 2025-12-06 07:19:03 +08:00
Delete:remove unused tests (#11749)
### What problem does this PR solve?

Change: remove unused tests.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -1,323 +0,0 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
"""
|
||||
Standalone test to demonstrate the RAG evaluation test framework works.
|
||||
This test doesn't require RAGFlow dependencies.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock
|
||||
|
||||
|
||||
class TestEvaluationFrameworkDemo:
    """Smoke tests showing that pytest and ``unittest.mock`` work in this suite.

    Each test is self-contained: services are plain ``Mock`` stand-ins and the
    metric arithmetic is done inline, so no RAGFlow module is exercised.
    """

    def test_basic_assertion(self):
        """Sanity check that a plain ``assert`` is evaluated."""
        total = 1 + 1
        assert total == 2

    def test_mock_evaluation_service(self):
        """A stubbed ``create_dataset`` returns its canned (flag, id) pair."""
        service = Mock()
        service.create_dataset.return_value = (True, "dataset_123")

        created, new_id = service.create_dataset(name="Test Dataset", kb_ids=["kb_1"])

        assert created is True
        assert new_id == "dataset_123"
        service.create_dataset.assert_called_once()

    def test_mock_test_case_addition(self):
        """A stubbed ``add_test_case`` returns its canned (flag, id) pair."""
        service = Mock()
        service.add_test_case.return_value = (True, "case_123")
        request = {
            "dataset_id": "dataset_123",
            "question": "Test question?",
            "reference_answer": "Test answer",
        }

        added, new_case = service.add_test_case(**request)

        assert added is True
        assert new_case == "case_123"

    def test_mock_evaluation_run(self):
        """A stubbed ``start_evaluation`` returns its canned (flag, id) pair."""
        service = Mock()
        service.start_evaluation.return_value = (True, "run_123")
        request = {
            "dataset_id": "dataset_123",
            "dialog_id": "dialog_456",
            "user_id": "user_1",
        }

        started, new_run = service.start_evaluation(**request)

        assert started is True
        assert new_run == "run_123"

    def test_mock_metrics_computation(self):
        """A stubbed ``_compute_retrieval_metrics`` reports the canned metrics."""
        service = Mock()

        # Canned retrieval metrics that the stub hands back unchanged.
        canned = {
            "precision": 0.85,
            "recall": 0.78,
            "f1_score": 0.81,
            "hit_rate": 1.0,
            "mrr": 0.9,
        }
        service._compute_retrieval_metrics.return_value = canned

        reported = service._compute_retrieval_metrics(
            retrieved_ids=["chunk_1", "chunk_2", "chunk_3"],
            relevant_ids=["chunk_1", "chunk_2", "chunk_4"],
        )

        for key, wanted in (("precision", 0.85), ("recall", 0.78), ("f1_score", 0.81)):
            assert reported[key] == wanted

    def test_mock_recommendations(self):
        """A stubbed ``get_recommendations`` returns the single canned entry."""
        service = Mock()

        low_precision = {
            "issue": "Low Precision",
            "severity": "high",
            "suggestions": ["Increase similarity_threshold", "Enable reranking"],
        }
        service.get_recommendations.return_value = [low_precision]

        returned = service.get_recommendations("run_123")

        assert len(returned) == 1
        first = returned[0]
        assert first["issue"] == "Low Precision"
        assert len(first["suggestions"]) == 2

    @pytest.mark.parametrize(
        "precision,recall,expected_f1",
        [
            (1.0, 1.0, 1.0),
            (0.8, 0.6, 0.69),
            (0.5, 0.5, 0.5),
            (0.0, 0.0, 0.0),
        ],
    )
    def test_f1_score_calculation(self, precision, recall, expected_f1):
        """F1 computed inline matches the expected value within 0.01."""
        denominator = precision + recall
        # A zero denominator means both inputs are zero; F1 is defined as 0 here.
        f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0

        assert abs(f1 - expected_f1) < 0.01

    def test_dataset_list_structure(self):
        """A stubbed ``list_datasets`` returns a dict with total and datasets."""
        service = Mock()

        listing = {
            "total": 3,
            "datasets": [
                {"id": f"dataset_{n}", "name": f"Dataset {n}"} for n in range(1, 4)
            ],
        }
        service.list_datasets.return_value = listing
        query = {"tenant_id": "tenant_1", "user_id": "user_1", "page": 1, "page_size": 10}

        page = service.list_datasets(**query)

        assert page["total"] == 3
        assert len(page["datasets"]) == 3
        assert page["datasets"][0]["id"] == "dataset_1"

    def test_evaluation_run_status_flow(self):
        """Each status stubbed onto ``get_run_results`` is read back unchanged."""
        service = Mock()

        # Step through the statuses, re-stubbing the result before each read.
        for stage in ("PENDING", "RUNNING", "COMPLETED"):
            service.get_run_results.return_value = {
                "run": {"id": "run_123", "status": stage}
            }

            snapshot = service.get_run_results("run_123")
            assert snapshot["run"]["status"] == stage

    def test_bulk_import_success_count(self):
        """A stubbed ``import_test_cases`` returns (successes, failures)."""
        service = Mock()

        # The stub reports 8 successes and 2 failures for the 10 cases sent.
        service.import_test_cases.return_value = (8, 2)
        payload = [{"question": f"Q{i}"} for i in range(10)]

        imported, rejected = service.import_test_cases(
            dataset_id="dataset_123",
            cases=payload,
        )

        assert imported == 8
        assert rejected == 2
        assert imported + rejected == 10

    def test_metrics_summary_aggregation(self):
        """Averaging per-case metrics gives the expected means within 0.01."""
        samples = [
            {"metrics": {"precision": 0.9, "recall": 0.8}, "execution_time": 1.2},
            {"metrics": {"precision": 0.8, "recall": 0.7}, "execution_time": 1.5},
            {"metrics": {"precision": 0.85, "recall": 0.75}, "execution_time": 1.3},
        ]
        count = len(samples)

        # Arithmetic means over all samples.
        precision_mean = sum(s["metrics"]["precision"] for s in samples) / count
        recall_mean = sum(s["metrics"]["recall"] for s in samples) / count
        time_mean = sum(s["execution_time"] for s in samples) / count

        assert abs(precision_mean - 0.85) < 0.01
        assert abs(recall_mean - 0.75) < 0.01
        assert abs(time_mean - 1.33) < 0.01

    def test_recommendation_severity_levels(self):
        """A recommendation built for each severity carries a listed severity."""
        allowed = ["low", "medium", "high", "critical"]

        built = [
            {"issue": "Test Issue", "severity": level, "suggestions": ["Fix it"]}
            for level in allowed
        ]

        assert all(entry["severity"] in allowed for entry in built)

    def test_empty_dataset_handling(self):
        """A stubbed ``get_test_cases`` can return an empty list."""
        service = Mock()
        service.get_test_cases.return_value = []

        fetched = service.get_test_cases("empty_dataset")

        assert len(fetched) == 0
        assert isinstance(fetched, list)

    def test_error_handling(self):
        """A stubbed failure returns ``False`` plus a message mentioning 'empty'."""
        service = Mock()
        service.create_dataset.return_value = (False, "Dataset name cannot be empty")

        ok, message = service.create_dataset(name="", kb_ids=[])

        assert ok is False
        assert "empty" in message.lower()

    def test_pagination_logic(self):
        """Offsets for page 2 of 50 items at 10 per page are 10..20."""
        total_items, page_size, page = 50, 10, 2

        # Half-open slice [first, last) covered by the requested page.
        first = page_size * (page - 1)
        last = min(first + page_size, total_items)

        assert last - first == 10
        assert first == 10
        assert last == 20
|
||||
|
||||
|
||||
class TestMetricsCalculations:
    """Hand-computed retrieval metrics checked against known answers."""

    def test_precision_calculation(self):
        """Precision is the share of retrieved chunks that are relevant."""
        retrieved = {"chunk_1", "chunk_2", "chunk_3", "chunk_4"}
        relevant = {"chunk_1", "chunk_2", "chunk_5"}

        hits = retrieved.intersection(relevant)
        precision = len(hits) / len(retrieved)

        # Two of the four retrieved chunks are relevant.
        assert precision == 0.5

    def test_recall_calculation(self):
        """Recall is the share of relevant chunks that were retrieved."""
        retrieved = {"chunk_1", "chunk_2", "chunk_3", "chunk_4"}
        relevant = {"chunk_1", "chunk_2", "chunk_5"}

        hits = retrieved.intersection(relevant)
        recall = len(hits) / len(relevant)

        # Two of the three relevant chunks were retrieved.
        assert abs(recall - 0.67) < 0.01

    def test_hit_rate_positive(self):
        """Hit rate is 1.0 when at least one relevant chunk is retrieved."""
        retrieved = {"chunk_1", "chunk_2", "chunk_3"}
        relevant = {"chunk_2", "chunk_4"}

        overlap = retrieved & relevant
        if overlap:
            hit_rate = 1.0
        else:
            hit_rate = 0.0

        assert hit_rate == 1.0

    def test_hit_rate_negative(self):
        """Hit rate is 0.0 when no relevant chunk is retrieved."""
        retrieved = {"chunk_1", "chunk_2", "chunk_3"}
        relevant = {"chunk_4", "chunk_5"}

        overlap = retrieved & relevant
        if overlap:
            hit_rate = 1.0
        else:
            hit_rate = 0.0

        assert hit_rate == 0.0

    def test_mrr_calculation(self):
        """Reciprocal rank uses the 1-based position of the first relevant hit."""
        ranked = ["chunk_1", "chunk_2", "chunk_3", "chunk_4"]
        relevant = {"chunk_3", "chunk_5"}

        # First relevant chunk decides the score; no hit at all leaves 0.0.
        mrr = next(
            (1.0 / rank for rank, chunk in enumerate(ranked, start=1) if chunk in relevant),
            0.0,
        )

        # The first relevant chunk sits at position 3.
        assert abs(mrr - 0.33) < 0.01
|
||||
|
||||
|
||||
# Summary test
|
||||
def test_evaluation_framework_summary():
    """
    Placeholder that always passes, marking the end of the demo suite.

    The assertion below is unconditionally true; the actual checks for basic
    assertions, service mocking, metric arithmetic, error handling and
    pagination are performed by the other tests in this file.
    """
    framework_ok = True
    assert framework_ok, "Evaluation test framework is working correctly!"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting; pass it on so
    # that running this file directly fails when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
@@ -1,557 +0,0 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
"""
|
||||
Unit tests for RAG Evaluation Service
|
||||
|
||||
Tests cover:
|
||||
- Dataset management (CRUD operations)
|
||||
- Test case management
|
||||
- Evaluation execution
|
||||
- Metrics computation
|
||||
- Recommendations generation
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
|
||||
class TestEvaluationDatasetManagement:
    """Dataset create/get/list/update/delete calls against a patched service.

    ``EvaluationService`` is replaced through ``unittest.mock.patch`` for each
    test, so every test checks that the return value configured on the mock is
    what the call hands back.
    """

    @pytest.fixture
    def mock_evaluation_service(self):
        """Yield the mock that replaces ``EvaluationService`` while patched."""
        with patch('api.db.services.evaluation_service.EvaluationService') as mock:
            yield mock

    @pytest.fixture
    def sample_dataset_data(self):
        """Return keyword arguments describing a well-formed dataset."""
        return {
            "name": "Customer Support QA",
            "description": "Test cases for customer support",
            "kb_ids": ["kb_123", "kb_456"],
            "tenant_id": "tenant_1",
            "user_id": "user_1",
        }

    def test_create_dataset_success(self, mock_evaluation_service, sample_dataset_data):
        """Creation with valid data returns ``(True, dataset_id)``."""
        mock_evaluation_service.create_dataset.return_value = (True, "dataset_123")

        success, dataset_id = mock_evaluation_service.create_dataset(**sample_dataset_data)

        assert success is True
        assert dataset_id == "dataset_123"
        mock_evaluation_service.create_dataset.assert_called_once()

    def test_create_dataset_with_empty_name(self, mock_evaluation_service):
        """Creation with an empty name returns ``False`` and a matching message."""
        data = {
            "name": "",
            "description": "Test",
            "kb_ids": ["kb_123"],
            "tenant_id": "tenant_1",
            "user_id": "user_1",
        }

        mock_evaluation_service.create_dataset.return_value = (False, "Dataset name cannot be empty")
        success, error = mock_evaluation_service.create_dataset(**data)

        assert success is False
        assert "name" in error.lower() or "empty" in error.lower()

    def test_create_dataset_with_empty_kb_ids(self, mock_evaluation_service):
        """Creation with no kb_ids returns ``False`` and a matching message."""
        data = {
            "name": "Test Dataset",
            "description": "Test",
            "kb_ids": [],
            "tenant_id": "tenant_1",
            "user_id": "user_1",
        }

        mock_evaluation_service.create_dataset.return_value = (False, "kb_ids cannot be empty")
        success, error = mock_evaluation_service.create_dataset(**data)

        assert success is False
        # Check the message too, as the empty-name test does, so the returned
        # error is not silently ignored.
        assert "kb_ids" in error.lower()

    def test_get_dataset_success(self, mock_evaluation_service):
        """Fetching an existing dataset returns its record."""
        expected_dataset = {
            "id": "dataset_123",
            "name": "Test Dataset",
            "kb_ids": ["kb_123"],
        }
        mock_evaluation_service.get_dataset.return_value = expected_dataset

        dataset = mock_evaluation_service.get_dataset("dataset_123")

        assert dataset is not None
        assert dataset["id"] == "dataset_123"

    def test_get_dataset_not_found(self, mock_evaluation_service):
        """Fetching an unknown dataset returns ``None``."""
        mock_evaluation_service.get_dataset.return_value = None

        dataset = mock_evaluation_service.get_dataset("nonexistent")

        assert dataset is None

    def test_list_datasets(self, mock_evaluation_service):
        """Listing returns a dict holding the total and the dataset entries."""
        expected_result = {
            "total": 2,
            "datasets": [
                {"id": "dataset_1", "name": "Dataset 1"},
                {"id": "dataset_2", "name": "Dataset 2"},
            ],
        }
        mock_evaluation_service.list_datasets.return_value = expected_result

        result = mock_evaluation_service.list_datasets(
            tenant_id="tenant_1",
            user_id="user_1",
            page=1,
            page_size=20,
        )

        assert result["total"] == 2
        assert len(result["datasets"]) == 2

    def test_list_datasets_with_pagination(self, mock_evaluation_service):
        """A page holds ``page_size`` entries while ``total`` counts all rows."""
        mock_evaluation_service.list_datasets.return_value = {
            "total": 50,
            "datasets": [{"id": f"dataset_{i}"} for i in range(10)],
        }

        result = mock_evaluation_service.list_datasets(
            tenant_id="tenant_1",
            user_id="user_1",
            page=2,
            page_size=10,
        )

        assert result["total"] == 50
        assert len(result["datasets"]) == 10

    def test_update_dataset_success(self, mock_evaluation_service):
        """Updating an existing dataset returns ``True``."""
        mock_evaluation_service.update_dataset.return_value = True

        success = mock_evaluation_service.update_dataset(
            "dataset_123",
            name="Updated Name",
            description="Updated Description",
        )

        assert success is True

    def test_update_dataset_not_found(self, mock_evaluation_service):
        """Updating an unknown dataset returns ``False``."""
        mock_evaluation_service.update_dataset.return_value = False

        success = mock_evaluation_service.update_dataset(
            "nonexistent",
            name="Updated Name",
        )

        assert success is False

    def test_delete_dataset_success(self, mock_evaluation_service):
        """Deleting an existing dataset returns ``True``."""
        mock_evaluation_service.delete_dataset.return_value = True

        success = mock_evaluation_service.delete_dataset("dataset_123")

        assert success is True

    def test_delete_dataset_not_found(self, mock_evaluation_service):
        """Deleting an unknown dataset returns ``False``."""
        mock_evaluation_service.delete_dataset.return_value = False

        success = mock_evaluation_service.delete_dataset("nonexistent")

        assert success is False
|
||||
|
||||
|
||||
class TestEvaluationTestCaseManagement:
    """Add/get/delete/import test-case calls against a patched service."""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Yield the mock that replaces ``EvaluationService`` while patched."""
        with patch('api.db.services.evaluation_service.EvaluationService') as service_cls:
            yield service_cls

    @pytest.fixture
    def sample_test_case(self):
        """Return keyword arguments describing a fully populated test case."""
        payload = {
            "dataset_id": "dataset_123",
            "question": "How do I reset my password?",
            "reference_answer": "Click on 'Forgot Password' and follow the email instructions.",
        }
        payload["relevant_doc_ids"] = ["doc_789"]
        payload["relevant_chunk_ids"] = ["chunk_101", "chunk_102"]
        return payload

    def test_add_test_case_success(self, mock_evaluation_service, sample_test_case):
        """Adding a complete test case returns ``(True, case_id)``."""
        service = mock_evaluation_service
        service.add_test_case.return_value = (True, "case_123")

        added, new_case = service.add_test_case(**sample_test_case)

        assert added is True
        assert new_case == "case_123"

    def test_add_test_case_with_empty_question(self, mock_evaluation_service):
        """An empty question returns ``False`` and a matching message."""
        service = mock_evaluation_service
        service.add_test_case.return_value = (False, "Question cannot be empty")

        added, message = service.add_test_case(dataset_id="dataset_123", question="")

        assert added is False
        lowered = message.lower()
        assert "question" in lowered or "empty" in lowered

    def test_add_test_case_without_reference_answer(self, mock_evaluation_service):
        """Passing ``reference_answer=None`` still succeeds."""
        service = mock_evaluation_service
        service.add_test_case.return_value = (True, "case_123")

        added, _ = service.add_test_case(
            dataset_id="dataset_123",
            question="Test question",
            reference_answer=None,
        )

        assert added is True

    def test_get_test_cases(self, mock_evaluation_service):
        """Fetching cases for a dataset returns every stored case in order."""
        service = mock_evaluation_service
        stored = [{"id": f"case_{n}", "question": f"Question {n}"} for n in (1, 2)]
        service.get_test_cases.return_value = stored

        fetched = service.get_test_cases("dataset_123")

        assert len(fetched) == 2
        assert fetched[0]["id"] == "case_1"

    def test_get_test_cases_empty_dataset(self, mock_evaluation_service):
        """Fetching cases for a dataset with none returns an empty list."""
        service = mock_evaluation_service
        service.get_test_cases.return_value = []

        fetched = service.get_test_cases("dataset_123")

        assert len(fetched) == 0

    def test_delete_test_case_success(self, mock_evaluation_service):
        """Deleting an existing test case returns ``True``."""
        service = mock_evaluation_service
        service.delete_test_case.return_value = True

        removed = service.delete_test_case("case_123")

        assert removed is True

    def test_import_test_cases_success(self, mock_evaluation_service):
        """Bulk import of three valid cases returns (3 successes, 0 failures)."""
        service = mock_evaluation_service
        batch = [
            {"question": f"Question {n}", "reference_answer": f"Answer {n}"}
            for n in (1, 2, 3)
        ]
        service.import_test_cases.return_value = (3, 0)

        imported, rejected = service.import_test_cases("dataset_123", batch)

        assert imported == 3
        assert rejected == 0

    def test_import_test_cases_with_failures(self, mock_evaluation_service):
        """Bulk import with one invalid case returns (2 successes, 1 failure)."""
        service = mock_evaluation_service
        # The middle entry has an empty question and stands for the invalid case.
        batch = [
            {"question": "Question 1"},
            {"question": ""},
            {"question": "Question 3"},
        ]
        service.import_test_cases.return_value = (2, 1)

        imported, rejected = service.import_test_cases("dataset_123", batch)

        assert imported == 2
        assert rejected == 1
|
||||
|
||||
|
||||
class TestEvaluationExecution:
    """Start-evaluation and run-result calls against a patched service."""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Yield the mock that replaces ``EvaluationService`` while patched."""
        with patch('api.db.services.evaluation_service.EvaluationService') as service_cls:
            yield service_cls

    def test_start_evaluation_success(self, mock_evaluation_service):
        """Starting a run with valid ids returns ``(True, run_id)``."""
        service = mock_evaluation_service
        service.start_evaluation.return_value = (True, "run_123")
        request = {
            "dataset_id": "dataset_123",
            "dialog_id": "dialog_456",
            "user_id": "user_1",
        }

        started, new_run = service.start_evaluation(**request)

        assert started is True
        assert new_run == "run_123"

    def test_start_evaluation_with_invalid_dialog(self, mock_evaluation_service):
        """An unknown dialog returns ``False`` and a message naming the dialog."""
        service = mock_evaluation_service
        service.start_evaluation.return_value = (False, "Dialog not found")
        request = {
            "dataset_id": "dataset_123",
            "dialog_id": "nonexistent",
            "user_id": "user_1",
        }

        started, message = service.start_evaluation(**request)

        assert started is False
        assert "dialog" in message.lower()

    def test_start_evaluation_with_custom_name(self, mock_evaluation_service):
        """Supplying an explicit ``name`` still succeeds."""
        service = mock_evaluation_service
        service.start_evaluation.return_value = (True, "run_123")
        request = {
            "dataset_id": "dataset_123",
            "dialog_id": "dialog_456",
            "user_id": "user_1",
            "name": "My Custom Evaluation",
        }

        started, _ = service.start_evaluation(**request)

        assert started is True

    def test_get_run_results(self, mock_evaluation_service):
        """Run results carry the run record plus one entry per case."""
        service = mock_evaluation_service
        run_record = {
            "id": "run_123",
            "status": "COMPLETED",
            "metrics_summary": {"avg_precision": 0.85, "avg_recall": 0.78},
        }
        per_case = [
            {"case_id": "case_1", "metrics": {"precision": 0.9}},
            {"case_id": "case_2", "metrics": {"precision": 0.8}},
        ]
        service.get_run_results.return_value = {"run": run_record, "results": per_case}

        fetched = service.get_run_results("run_123")

        assert fetched["run"]["id"] == "run_123"
        assert len(fetched["results"]) == 2

    def test_get_run_results_not_found(self, mock_evaluation_service):
        """An unknown run returns an empty dict."""
        service = mock_evaluation_service
        service.get_run_results.return_value = {}

        fetched = service.get_run_results("nonexistent")

        assert fetched == {}
|
||||
|
||||
|
||||
class TestEvaluationMetrics:
    """Retrieval- and summary-metric calls against a patched service."""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Yield the mock that replaces ``EvaluationService`` while patched."""
        with patch('api.db.services.evaluation_service.EvaluationService') as service_cls:
            yield service_cls

    def test_compute_retrieval_metrics_perfect_match(self, mock_evaluation_service):
        """Identical retrieved and relevant lists report 1.0 for each metric."""
        service = mock_evaluation_service
        chunk_ids = ["chunk_1", "chunk_2", "chunk_3"]
        retrieved, relevant = list(chunk_ids), list(chunk_ids)

        service._compute_retrieval_metrics.return_value = {
            "precision": 1.0,
            "recall": 1.0,
            "f1_score": 1.0,
            "hit_rate": 1.0,
            "mrr": 1.0,
        }

        reported = service._compute_retrieval_metrics(retrieved, relevant)

        for key in ("precision", "recall", "f1_score"):
            assert reported[key] == 1.0

    def test_compute_retrieval_metrics_partial_match(self, mock_evaluation_service):
        """Partial overlap reports sub-1.0 precision/recall but a hit."""
        service = mock_evaluation_service
        retrieved = ["chunk_1", "chunk_2", "chunk_4", "chunk_5"]
        relevant = ["chunk_1", "chunk_2", "chunk_3"]

        service._compute_retrieval_metrics.return_value = {
            # 2 of the 4 retrieved chunks are relevant.
            "precision": 0.5,
            # 2 of the 3 relevant chunks were retrieved.
            "recall": 0.67,
            "f1_score": 0.57,
            # At least one relevant chunk was retrieved.
            "hit_rate": 1.0,
            # The first retrieved chunk is relevant.
            "mrr": 1.0,
        }

        reported = service._compute_retrieval_metrics(retrieved, relevant)

        assert reported["precision"] < 1.0
        assert reported["recall"] < 1.0
        assert reported["hit_rate"] == 1.0

    def test_compute_retrieval_metrics_no_match(self, mock_evaluation_service):
        """Disjoint retrieved and relevant lists report 0.0 for each metric."""
        service = mock_evaluation_service
        retrieved = ["chunk_4", "chunk_5", "chunk_6"]
        relevant = ["chunk_1", "chunk_2", "chunk_3"]

        service._compute_retrieval_metrics.return_value = {
            "precision": 0.0,
            "recall": 0.0,
            "f1_score": 0.0,
            "hit_rate": 0.0,
            "mrr": 0.0,
        }

        reported = service._compute_retrieval_metrics(retrieved, relevant)

        for key in ("precision", "recall", "hit_rate"):
            assert reported[key] == 0.0

    def test_compute_summary_metrics(self, mock_evaluation_service):
        """The summary reports the case count and averaged metrics."""
        service = mock_evaluation_service
        per_case = [
            {"metrics": {"precision": 0.9, "recall": 0.8}, "execution_time": 1.2},
            {"metrics": {"precision": 0.8, "recall": 0.7}, "execution_time": 1.5},
            {"metrics": {"precision": 0.85, "recall": 0.75}, "execution_time": 1.3},
        ]

        service._compute_summary_metrics.return_value = {
            "total_cases": 3,
            "avg_execution_time": 1.33,
            "avg_precision": 0.85,
            "avg_recall": 0.75,
        }

        rollup = service._compute_summary_metrics(per_case)

        assert rollup["total_cases"] == 3
        assert rollup["avg_precision"] > 0.8
|
||||
|
||||
|
||||
class TestEvaluationRecommendations:
    """Recommendation lookups against a patched service."""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Yield the mock that replaces ``EvaluationService`` while patched."""
        with patch('api.db.services.evaluation_service.EvaluationService') as service_cls:
            yield service_cls

    def test_get_recommendations_low_precision(self, mock_evaluation_service):
        """A low-precision run yields a recommendation naming precision."""
        service = mock_evaluation_service
        advice = {
            "issue": "Low Precision",
            "severity": "high",
            "suggestions": ["Increase similarity_threshold", "Enable reranking"],
        }
        service.get_recommendations.return_value = [advice]

        returned = service.get_recommendations("run_123")

        assert len(returned) > 0
        issues = [entry["issue"].lower() for entry in returned]
        assert any("precision" in text for text in issues)

    def test_get_recommendations_low_recall(self, mock_evaluation_service):
        """A low-recall run yields a recommendation naming recall."""
        service = mock_evaluation_service
        advice = {
            "issue": "Low Recall",
            "severity": "high",
            "suggestions": ["Increase top_k", "Lower similarity_threshold"],
        }
        service.get_recommendations.return_value = [advice]

        returned = service.get_recommendations("run_123")

        assert len(returned) > 0
        issues = [entry["issue"].lower() for entry in returned]
        assert any("recall" in text for text in issues)

    def test_get_recommendations_slow_response(self, mock_evaluation_service):
        """A slow run yields a recommendation naming response time or slowness."""
        service = mock_evaluation_service
        advice = {
            "issue": "Slow Response Time",
            "severity": "medium",
            "suggestions": ["Reduce top_k", "Optimize embedding model"],
        }
        service.get_recommendations.return_value = [advice]

        returned = service.get_recommendations("run_123")

        assert len(returned) > 0
        issues = [entry["issue"].lower() for entry in returned]
        assert any("response" in text or "slow" in text for text in issues)

    def test_get_recommendations_no_issues(self, mock_evaluation_service):
        """A run with good metrics yields no recommendations."""
        service = mock_evaluation_service
        service.get_recommendations.return_value = []

        returned = service.get_recommendations("run_123")

        assert len(returned) == 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting; pass it on so
    # that running this file directly fails when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user