I'm working with a bunch of yaml files that are rather sloppy. This is outside of my control unfortunately, so I'm trying to clean up the data after the fact. My approach doesn't allow for nested structures yet, but I was wondering if this approach seems reasonable so far, if the code is readable, and if there are suggestions to improve the code.
from dataclasses import dataclass, field, fields
from enum import auto, Enum
from pathlib import Path
from typing import Any
import yaml
class HandleValidationError(Enum):
"""
An enumeration representing options for handling validation errors in the MappedYaml class.
Enum Values:
ignore: Ignore validation errors and proceed with the invalid data.
reject: Reject the invalid data and exclude it from further processing.
correct: Attempt to correct the invalid data and continue with the corrected values.
"""
ignore = auto()
reject = auto()
correct = auto()
class StatusValidationError(Enum):
"""
An enumeration representing the status of validation errors in the MappedYaml class.
Enum Values:
ignored: The validation error was ignored.
rejected: The validation error led to rejection of the invalid data.
correct_fail: Attempted correction of the invalid data failed.
correct_success: Attempted correction of the invalid data succeeded.
"""
ignored = auto()
rejected = auto()
correct_fail = auto()
correct_success = auto()
@dataclass
class ValidationErrorLog:
"""
A class to store validation error logs for YAML data in the MappedYaml class.
"""
missing_key: list[str] = field(default_factory=list)
without_value: list[str] = field(default_factory=list)
incorrectly_formatted_value: list[tuple[str, StatusValidationError]] = field(default_factory=list)
invalid_value_type: list[tuple[str, StatusValidationError]] = field(default_factory=list)
def add_missing_key(self, missing_key: str) -> None:
self.missing_key.append(missing_key)
def add_key_without_value(self, key_without_value: str) -> None:
self.without_value.append(key_without_value)
def add_key_with_incorrectly_formatted_value(self, key_with_incorrectly_formatted_value: str, status: StatusValidationError) -> None:
self.incorrectly_formatted_value.append((key_with_incorrectly_formatted_value, status))
def add_key_with_incorrect_value_type(self, key_with_invalid_value_type: str, status: StatusValidationError) -> None:
self.invalid_value_type.append((key_with_invalid_value_type, status))
@dataclass(kw_only=True)
class MappedYaml:
"""
A base class for mapping YAML data to Python objects with validation.
"""
validation_error_log: ValidationErrorLog = None
@classmethod
def init_with_validation(
cls,
path: Path,
handle_type_errors: HandleValidationError = HandleValidationError.correct,
handle_formatting_errors: HandleValidationError = HandleValidationError.correct
) -> 'MappedYaml':
"""
Initialize an instance of the class with validation.
Args:
path (Path): The path to the YAML file.
handle_type_errors (HandleValidationError): How to handle errors related to invalid value types.
handle_formatting_errors (HandleValidationError): How to handle errors related to invalid formatting.
Returns:
MappedYaml: An instance of the class with validated data.
Raises:
ValueError: If the file has an invalid suffix.
TypeError: If the child class cannot be initialized.
"""
if not path.suffix == '.yaml':
raise ValueError(f'File "{path.as_posix()}" has invalid suffix. Only .yaml files are supported.')
log = cls._new_log()
data = cls._get_data(path, log)
cls._check_invalid_types(data, log, handle_type_errors)
cls._check_invalid_formatting(data, log, handle_formatting_errors)
try:
return cls(validation_error_log=log, **data)
except TypeError:
raise TypeError('Could not initialize child class. Check if the child class has default values for '
'every field.')
@classmethod
def _new_log(cls) -> ValidationErrorLog:
"""
Create a new instance of the validation error log. Override this method if you wish to use a custom version
of ValidationErrorLog().
"""
return ValidationErrorLog()
@classmethod
def _get_data(
cls,
path: Path,
log: ValidationErrorLog
) -> dict[str, Any]:
"""
Attempt to retrieve the data from a YAML file, as defined in the fields of the child class. Logs missing keys
and values from the Yaml file.
"""
with open(path) as file:
all_data = yaml.safe_load(file)
result = {}
for f in fields(cls)[1:]:
try:
data_value = all_data[f.name]
except KeyError:
log.add_missing_key(f.name)
else:
if not data_value:
log.add_key_without_value(f.name)
result[f.name] = data_value
return result
@classmethod
def _check_invalid_types(
cls,
data: dict[str, Any],
log: ValidationErrorLog,
handle_type_errors: HandleValidationError
) -> None:
"""
Checks if the value types of the yaml data aligns with the field types declared in the fields of
the child class.
"""
required_types = {f.name: f.type for f in fields(cls)}
keys_to_remove = []
for data_key, data_value in data.items():
required_type = required_types.get(data_key)
if data_value and not isinstance(data_value, required_type):
status = cls._handle_invalid_type(data, data_key, required_type, handle_type_errors)
log.add_key_with_incorrect_value_type(data_key, status)
if status in {StatusValidationError.rejected, StatusValidationError.correct_fail}:
keys_to_remove.append(data_key)
for key in keys_to_remove:
data.pop(key)
@classmethod
def _handle_invalid_type(
cls,
data: dict[str, Any],
data_key: str,
required_type: type,
handle_type_errors: HandleValidationError
) -> StatusValidationError:
"""
Handle cases where a value has an invalid type.
"""
if handle_type_errors == HandleValidationError.ignore:
return StatusValidationError.ignored
elif handle_type_errors == HandleValidationError.reject:
return StatusValidationError.rejected
else:
try:
data[data_key] = required_type(data[data_key])
return StatusValidationError.correct_success
except ValueError:
return StatusValidationError.correct_fail
@classmethod
def _check_invalid_formatting(
cls,
data: dict[str, Any],
log: ValidationErrorLog,
handle_formatting_errors: HandleValidationError
) -> None:
"""
Check for invalid formatting in the data. Implement to add custom formatting checks for your data.
"""
pass
# example of a child class:
@dataclass(kw_only=True)
class StudyYamlData(MappedYaml):
study_id: str = ''
year: int = None
abstract: str = ''
reference: str = ''
keywords: str = ''
domain_general: str = ''
domain_specific: str = ''
1 Answer 1
nouns and verbs
We use nouns for class names and verbs for function (or method) names.
class HandleValidationError(Enum):
"""
An enumeration representing options for handling validation errors in the MappedYaml class.
That looks more like a ValidationAction
to me (or perhaps a "recovery type").
Grammar in the docstring is perfect -- other code will be handling each error.
But this code is not about "handling" errors and fixing them,
it's about attaching a descriptive tag.
nit: To "correct" horribly bad data is sometimes impossible. Consider using "impute" instead.
Using an Enum here was very nice. I will just observe in passing that we don't necessarily need that level of indirection. We could choose to mention class names directly, so we're pointing directly at the code which implements the recovery strategy.
errors are exceptions
class StatusValidationError(Enum):
Yikes! Up above I somehow knew the name was "wrong", but I didn't even realize exactly why I felt that so strongly. But now the pattern becomes clearer. Yeah, we have FooError which inherits from Enum rather than Exception -- that's pretty surprising! Please don't do that.
I guess each of these is a ValidationErrorStatus
?
And perhaps the previous class is ValidationErrorStrategy
?
Sorry. It turns out that Naming is Hard. But it's really important, so we strive to get it (mostly) right.
I worry that this class seems mostly redundant with the strategy class,
except for a bool
on whether imputation succeeded or not.
Maybe failed imputation replaces input with a BAD_DATA sentinel
and then we don't need these four enums?
I will read further to see how that holds up.
dataclass methods
Yay, ValidationErrorLog
is a @dataclass
!
Other name suffixes to consider would be ...Store
and ...Repository
,
but ...Log
is perfectly nice as-is.
I really like the four fields, and their very clear signatures.
The four methods seem on the verbose side, like I'm reading java rather than python. Maybe we need them? We'll see. But given that your Public API defines four public fields, it's unclear why caller shouldn't just do the .append() himself. Not seeing much value add in these methods yet.
Well, certainly it's very clean!
types in docstrings
In MappedYaml init_with_validation
,
the optional type annotation is lovely, I thank you.
The Args section of the docstring seems mostly
redundant with the excellent parameter names you chose,
especially the types which repeat types seen in the signature.
Maybe some people find a list of known exceptions helpful? This isn't java. I figure lightning can strike at any moment, there's no knowing exactly what might happen during a call. As long as we have informative diagnostic error messages, which I'm seeing here, then we're good.
Consider renaming the first param to be slightly more verbose: yaml_path
docstring format
Tiny nit: in _new_log
, we see a very nice docstring.
Pep-8 asks that you write the first sentence and then
insert a blank line before going on with the helpful "override" advice.
tricky code
for f in fields(cls)[1:]:
In _get_data
it's not clear what's special
about the zeroth field.
Especially since MappedYaml doesn't have any fields at all.
The helpful example class seems to indicate that an "...id" might be special. We should find a way to document this. Maybe it makes sense to filter() fields that match an "id" regex? Or maybe "optional field" is the concept we're going for here? Or "ignored field"?
The if not data_value:
test is maybe looser than desired,
since many things can be falsey.
Or maybe the current behavior is exactly what you want,
and it just needs some better documentation.
I'm concerned that someone could run a YAML file through
this parser, be happy with it, then make a tiny edit
and be surprised by the parse result.
checking for valid types
In _check_invalid_types
, here's a small nit:
if status in {StatusValidationError.rejected,
StatusValidationError.correct_fail}:
This is very nice as-is.
I think you were going for speed -- usually we prefer to
use the in
operator on a set
than a sequence.
It's a micro-optimization.
We're constructing a new set
and then soon discarding it.
I haven't viewed the python3.12 bytecode,
but usually dis.dis
reveals that in (a, b)
does less work than if we construct a mutable sequence.
It gets to put the immutable tuple into the bytecode, once,
and then keep recycling it.
No biggie.
This is a nice method, very clear. Here's an algorithmic idea: maybe delete it? And let beartype worry about the heavy lifting? Then it's just a matter of catching any exceptions that it raises. Ok, maybe it's a bit late in the design process to contemplate merging in beartype. I just like how it consults type annotations.
default behavior
I found _check_invalid_formatting
surprisingly short.
Perhaps we'd like to offer some example behavior,
like prohibiting values longer than a megabyte,
as an example for future implementors?
That brings me back to _check_invalid_types
,
which also is void (returns -> None:
).
I kind of wish that both docstrings were more
explicit about "evaluate for side effects",
and then spell out the kinds of side effects contemplated.
If the Author's Intent is that both are likely
to be overriden in a child class according to
needs of a custom use case, perhaps we'd prefer public
method names rather than _private
?
Anyway, overall I find these are well engineered classes that do a good job of describing the details. I've written a couple three things about them, but that's more a reflection on me being chatty than upon code quality. I would use this as-is.
This codebase achieves its design goals.
I would be willing to delegate or accept maintenance tasks on it.
-
\$\begingroup\$ Thank you, excellent advice as always. I'm going to play around with beartype - even if I might not end up using it, it was already worth it for the beartype FAQ alone. π \$\endgroup\$Jan van Wijk– Jan van Wijk2023εΉ΄11ζ19ζ₯ 20:42:18 +00:00Commented Nov 19, 2023 at 20:42