Skip to content

edspdf.classifiers.align

align_labels(lines, labels, threshold=0.0001)

Align lines with possibly overlapping (and non-exhaustive) labels.

Possible matches are sorted by covered area. Lines that do not overlap any candidate label are left without a label (NaN).

PARAMETER DESCRIPTION
lines

DataFrame containing the lines

TYPE: pd.DataFrame

labels

DataFrame containing the labels

TYPE: pd.DataFrame

threshold

Threshold to use for discounting a label. Used if the labels DataFrame does not provide a threshold column, or to fill NaN values thereof.

TYPE: float DEFAULT: 0.0001

RETURNS DESCRIPTION
pd.DataFrame

A copy of the lines table, with the labels added.

Source code in edspdf/classifiers/align.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def align_labels(
    lines: pd.DataFrame,
    labels: pd.DataFrame,
    threshold: float = 0.0001,
) -> pd.DataFrame:
    """
    Align lines with possibly overlapping (and non-exhaustive) labels.

    Every line is paired with each candidate label box (joined on the columns
    the two tables share, typically ``page``; a cross join is used when they
    share none). For each line, candidates that pass the threshold come first,
    then candidates are ranked by the fraction of the label box that the line
    covers. The best candidate's label is kept only if it passes the
    threshold; otherwise the line's label is left missing (``NaN``).

    Parameters
    ----------
    lines : pd.DataFrame
        DataFrame containing the lines. Must provide the bounding box columns
        ``x0``, ``y0``, ``x1``, ``y1``. It is not modified.
    labels : pd.DataFrame
        DataFrame containing the labels. Must provide the bounding box columns
        ``X0``, ``Y0``, ``X1``, ``Y1`` and a ``label`` column. It is not
        modified.
    threshold : float, default 0.0001
        Minimum fraction of a line's area that a label box must cover for the
        label to be assigned. Used if the `labels` DataFrame does not provide
        a `threshold` column, or to fill `NaN` values thereof.

    Returns
    -------
    pd.DataFrame
        A copy of the lines table, with a ``label`` column added.

    Notes
    -----
    A temporary ``uid`` column is used internally to track lines; a ``uid``
    column already present in `lines` is therefore not carried over to the
    result. Lines that end up with no candidate label after the join (for
    instance when no label shares their join-key values) are absent from the
    result.
    """

    # Work on a keyed copy so the caller's DataFrame is never touched, even if
    # an exception is raised further down.
    keyed_lines = lines.assign(uid=range(len(lines)))

    geometry_columns = sorted(
        {"uid", "page", "x0", "y0", "x1", "y1"} & set(keyed_lines.columns)
    )
    df = keyed_lines[geometry_columns].copy()

    labels = labels.copy()
    if "threshold" in labels.columns:
        labels["threshold"] = labels["threshold"].fillna(threshold)
    else:
        labels["threshold"] = threshold

    # Join on whatever columns both tables share; with nothing in common,
    # every line is compared against every label.
    shared_columns = set(df.columns) & set(labels.columns)
    df = df.merge(labels, how="inner" if shared_columns else "cross")

    # Signed extent of the intersection along each axis; a non-positive value
    # means the boxes do not overlap along that axis.
    dx = df[["x1", "X1"]].min(axis=1) - df[["x0", "X0"]].max(axis=1)
    dy = df[["y1", "Y1"]].min(axis=1) - df[["y0", "Y0"]].max(axis=1)
    overlap = dx.clip(lower=0) * dy.clip(lower=0)

    line_area = (df["x1"] - df["x0"]) * (df["y1"] - df["y0"])
    label_area = (df["X1"] - df["X0"]) * (df["Y1"] - df["Y0"])

    # ratio: share of the line covered by the label box (thresholding).
    # ratio_mask: share of the label box covered by the line (ranking).
    df["ratio_mask"] = overlap / label_area
    df["thresholded"] = (overlap / line_area) >= df["threshold"]

    df = df.sort_values(["thresholded", "ratio_mask"], ascending=False)

    # Keep the best candidate *row* for each line. `groupby().first()` would
    # pick the first non-null value column by column, which can mix values
    # coming from different candidates when a label is missing.
    df = df.drop_duplicates("uid").sort_values("uid").reset_index(drop=True)

    df["label"] = df["label"].where(df["thresholded"])

    return keyed_lines.merge(df[["uid", "label"]], on="uid").drop(columns=["uid"])