Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 115 additions & 14 deletions src/crawlee/statistics/_error_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,135 @@

from __future__ import annotations

from collections import Counter
from dataclasses import dataclass
import traceback
from collections import Counter, defaultdict
from itertools import zip_longest
from typing import Union


@dataclass(frozen=True, unsafe_hash=True)
class ErrorGroup:
"""Identifies a group of similar errors."""

class_name: str | None
# Key of a single grouping level. `None` is used as the key when that level is switched off in the tracker.
GroupName = Union[str, None]
# Nested error counts: file-and-line key -> error class name key -> Counter keyed by error message.
ErrorFilenameGroups = dict[GroupName, dict[GroupName, Counter[GroupName]]]


class ErrorTracker:
"""Track errors and aggregates their counts by similarity."""

def __init__(self) -> None:
self._errors = Counter[ErrorGroup]()
def __init__(
self,
*,
show_error_name: bool = True,
show_file_and_line_number: bool = True,
show_error_message: bool = True,
show_full_message: bool = False,
) -> None:
self.show_error_name = show_error_name
self.show_file_and_line_number = show_file_and_line_number
self.show_error_message = show_error_message
if show_full_message and not show_error_message:
raise ValueError('`show_error_message` must be `True` if `show_full_message` is set to `True`')
self.show_full_message = show_full_message
self._errors: ErrorFilenameGroups = defaultdict(lambda: defaultdict(Counter))

def add(self, error: Exception) -> None:
"""Include an error in the statistics."""
error_group = ErrorGroup(class_name=error.__class__.__name__)
self._errors[error_group] += 1
error_group_name = error.__class__.__name__ if self.show_error_name else None
error_group_message = self._get_error_message(error)
error_group_file_and_line = self._get_file_and_line(error)

# First two levels are grouped only in case of exact match.
specific_groups = self._errors[error_group_file_and_line][error_group_name]

# Lowest level group is matched by similarity.
if error_group_message in specific_groups:
# Exact match.
specific_groups.update([error_group_message])
else:
for existing_error_group_message in specific_groups:
# Add to first group with similar text. Modify text with wildcard characters if necessary.
if new_error_group_message := self._create_generic_message(
existing_error_group_message, error_group_message
):
# Replace old name.
specific_groups[new_error_group_message] = specific_groups.pop(existing_error_group_message)
# Increment.
specific_groups.update([new_error_group_message])
break
else:
# No similar message found. Create new group.
self._errors[error_group_file_and_line][error_group_name].update([error_group_message])

def _get_file_and_line(self, error: Exception) -> str | None:
if self.show_file_and_line_number:
error_traceback = traceback.extract_tb(error.__traceback__)
return f'{error_traceback[0].filename.split("/")[-1]}:{error_traceback[0].lineno}'
return None

def _get_error_message(self, error: Exception) -> str | None:
if self.show_error_message:
if self.show_full_message:
return str(error.args[0])
return str(error.args[0]).split('\n')[0]
return None

@property
def unique_error_count(self) -> int:
"""Number of distinct kinds of errors."""
return len(self._errors)
unique_error_count = 0
for file_and_line_group in self._errors.values():
for name_group in file_and_line_group.values():
unique_error_count += len(name_group)
return unique_error_count

@property
def total(self) -> int:
"""Total number of errors."""
return sum(self._errors.values())
error_count = 0
for file_and_line_group in self._errors.values():
for name_group in file_and_line_group.values():
error_count += sum(name_group.values())
return error_count

def get_most_common_errors(self, n: int = 3) -> list[tuple[str | None, int]]:
"""Return n most common errors."""
all_errors: Counter[GroupName] = Counter()
for file_and_line_group_name, file_and_line_group in self._errors.items():
for name_group_name, name_group in file_and_line_group.items():
for message_group_name, count in name_group.items():
all_errors[self._get_error_repr(file_and_line_group_name, name_group_name, message_group_name)] = (
count
)
return all_errors.most_common(n)

def _get_error_repr(self, file_and_line: str | None, name: str | None, message: str | None) -> str:
"""Get the most specific error representation."""
file_and_line_part = f'{file_and_line}:' if file_and_line else ''
name_part = f'{name}:' if name else ''
message_part = f'{message}' if message else ''
return f'{file_and_line_part}{name_part}{message_part}'

@staticmethod
def _create_generic_message(message_1: str | None, message_2: str | None) -> str | None:
"""Create a generic error message from two messages, if they are similar enough.

Different parts of similar messages are replaced by `_`.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm kinda surprised by the choice of _ here - I think it might get mixed up with legit underscores in python error messages.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe *** will be better in Python context

"""
if message_1 is None or message_2 is None:
return None

replacement_string = '***'
replacement_count = 0

generic_message_parts = []
message_1_parts = message_1.split(' ')
message_2_parts = message_2.split(' ')
parts_count = min(len(message_1_parts), len(message_2_parts))

for message_1_part, message_2_part in zip_longest(message_1_parts, message_2_parts, fillvalue=''):
if message_1_part != message_2_part:
generic_message_parts.append(replacement_string)
replacement_count += 1
if replacement_count >= parts_count / 2:
# Messages are too different.
return ''
else:
generic_message_parts.append(message_1_part)
return ' '.join(generic_message_parts)
100 changes: 100 additions & 0 deletions tests/unit/_statistics/test_error_tracker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import traceback

import pytest

from crawlee.statistics._error_tracker import ErrorTracker


@pytest.mark.parametrize(
    ('error_tracker', 'expected_unique_errors'),
    [
        (ErrorTracker(), 4),
        (ErrorTracker(show_file_and_line_number=False), 3),
        (ErrorTracker(show_error_name=False), 3),
        (ErrorTracker(show_error_message=False), 3),
        (ErrorTracker(show_error_name=False, show_file_and_line_number=False), 2),
        (ErrorTracker(show_file_and_line_number=False, show_error_message=False), 2),
        (ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_error_message=False), 1),
    ],
)
def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None:
    """Feed the same five errors to differently configured trackers and check the resulting counts."""
    same_line_errors = [
        Exception('Some value error abc'),
        # Different type than the first error, so it is a different error.
        ValueError('Some value error abc'),
        # Same type and a message similar to the previous one, so it is considered the same.
        ValueError('Some value error cde'),
        # Same type, but the message differs too much from the previous one, so it is considered different.
        ValueError('Another value error efg'),
    ]

    for same_line_error in same_line_errors:
        try:
            raise same_line_error  # Every error of the list is raised from this single line.
        except Exception as caught:  # noqa:PERF203
            error_tracker.add(caught)

    try:
        raise ValueError('Some value error abc')  # Repeats an earlier error, but is raised from a different line.
    except Exception as caught:
        error_tracker.add(caught)

    assert error_tracker.total == 5
    assert error_tracker.unique_error_count == expected_unique_errors


@pytest.mark.parametrize(
    ('message_1', 'message_2', 'expected_generic_message'),
    [
        ('Some error number 123', 'Some error number 456', 'Some error number ***'),
        ('Some error number 123 456', 'Some error number 123 456 789', 'Some error number 123 456 ***'),
        ('Some error number 0 0 0', 'Some error number 1 0 1', 'Some error number *** 0 ***'),
    ],
)
def test_error_tracker_similar_messages_full_stack(
    message_1: str, message_2: str, expected_generic_message: str
) -> None:
    """Check that similar messages end up in one group whose name contains wildcard symbols."""
    error_tracker = ErrorTracker()
    raised_errors = [
        KeyError(message_1),
        KeyError(message_1),
        KeyError(message_1),
        ValueError(message_1),
        ValueError(message_2),
        RuntimeError(message_2),
    ]

    for raised_error in raised_errors:
        try:
            raise raised_error  # Every error of the list is raised from this single line.
        except Exception as caught:  # noqa:PERF203
            error_tracker.add(caught)
            line = traceback.extract_tb(caught.__traceback__)[0].lineno

    file_name = __file__.split('/')[-1]
    location = f'{file_name}:{line}'
    expected_errors = [
        (f'{location}:KeyError:{message_1}', 3),
        (f'{location}:ValueError:{expected_generic_message}', 2),
        (f'{location}:RuntimeError:{message_2}', 1),
    ]

    errors = error_tracker.get_most_common_errors()
    for position, (expected_repr, expected_count) in enumerate(expected_errors):
        assert errors[position][0] == expected_repr
        assert errors[position][1] == expected_count


@pytest.mark.parametrize(
    ('show_full_message', 'expected_message'),
    [
        (True, 'Error line 1\n Error line 2'),
        (False, 'Error line 1'),
    ],
)
def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None:
    """Check the reported error message for both values of `show_full_message`."""
    tracker_settings = {
        'show_error_name': False,
        'show_file_and_line_number': False,
        'show_full_message': show_full_message,
    }
    error_tracker = ErrorTracker(**tracker_settings)

    try:
        raise RuntimeError('Error line 1\n Error line 2')  # Message spanning two lines.
    except Exception as caught:
        error_tracker.add(caught)

    most_common_error, _ = error_tracker.get_most_common_errors()[0]
    assert most_common_error == expected_message
Loading