Compute a data quality score for an ML pipeline. Given a dataset (list of dicts), evaluate multiple quality dimensions — completeness, uniqueness of a key column, value-range validity, and type consistency — then return an aggregate score between 0 and 1.
Score each dimension independently as a ratio and return the weighted average.
def data_quality_score(
data: list[dict],
key_column: str,
numeric_columns: list[str] | None = None,
valid_ranges: dict | None = None,
weights: dict | None = None,
) -> dict:
if not data:
return {"overall": 0.0, "completeness": 0.0, "uniqueness": 0.0,
"validity": 0.0, "consistency": 0.0}
if weights is None:
weights = {"completeness": 0.3, "uniqueness": 0.2,
"validity": 0.25, "consistency": 0.25}
if numeric_columns is None:
numeric_columns = []
if valid_ranges is None:
valid_ranges = {}
n = len(data)
all_keys = set()
for row in data:
all_keys.update(row.keys())
# Completeness: fraction of non-None cells
total_cells = n * len(all_keys)
non_null = sum(1 for row in data for k in all_keys if row.get(k) is not None)
completeness = non_null / total_cells if total_cells else 1.0
# Uniqueness of key column
key_values = [row.get(key_column) for row in data if row.get(key_column) is not None]
uniqueness = len(set(key_values)) / len(key_values) if key_values else 0.0
# Validity: numeric values within expected ranges
valid_count, range_total = 0, 0
for col in numeric_columns:
lo, hi = valid_ranges.get(col, (float("-inf"), float("inf")))
for row in data:
v = row.get(col)
if v is not None:
range_total += 1
if lo <= v <= hi:
valid_count += 1
validity = valid_count / range_total if range_total else 1.0
# Consistency: type uniformity per column
consistent_cols = 0
for col in all_keys:
types = set(type(row[col]) for row in data if col in row and row[col] is not None)
if len(types) <= 1:
consistent_cols += 1
consistency = consistent_cols / len(all_keys) if all_keys else 1.0
overall = (
weights["completeness"] * completeness
+ weights["uniqueness"] * uniqueness
+ weights["validity"] * validity
+ weights["consistency"] * consistency
)
return {
"overall": round(overall, 4),
"completeness": round(completeness, 4),
"uniqueness": round(uniqueness, 4),
"validity": round(validity, 4),
"consistency": round(consistency, 4),
}