Skip to content

edspdf.classifiers

align

align_labels(lines, labels, threshold=0.0001)

Align lines with possibly overlapping (and non-exhaustive) labels.

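Possible matches are sorted by covered area. Lines whose best match does not reach the threshold (in particular, for any positive threshold, lines with no overlap at all) get no label.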

PARAMETER DESCRIPTION
lines

DataFrame containing the lines

TYPE: pd.DataFrame

labels

DataFrame containing the labels

TYPE: pd.DataFrame

threshold

Threshold to use for discounting a label. Used if the labels DataFrame does not provide a threshold column, or to fill NaN values thereof.

TYPE: float DEFAULT: 0.0001

RETURNS DESCRIPTION
pd.DataFrame

A copy of the lines table, with the labels added.

Source code in edspdf/classifiers/align.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def align_labels(
    lines: pd.DataFrame,
    labels: pd.DataFrame,
    threshold: float = 0.0001,
) -> pd.DataFrame:
    """
    Align lines with possibly overlapping (and non-exhaustive) labels.

    Each line is paired with its candidate labels: every label when the two
    tables share no column, otherwise only the labels that agree with the line
    on the shared columns (e.g. ``page``). Candidates are ranked first on
    whether they pass the threshold, then on the fraction of the label box
    covered by the overlap. The best candidate's label is kept if it passes the
    threshold; otherwise the line gets a missing label. Lines with no overlap
    at all are therefore left unlabelled for any positive threshold.

    The input tables are not modified.

    Parameters
    ----------
    lines : pd.DataFrame
        DataFrame containing the lines. Must provide the box columns
        ``x0``, ``y0``, ``x1`` and ``y1``.
    labels : pd.DataFrame
        DataFrame containing the labels. Must provide the box columns
        ``X0``, ``Y0``, ``X1``, ``Y1`` and a ``label`` column; may provide a
        per-label ``threshold`` column.
    threshold : float, default 0.0001
        Threshold to use for discounting a label: a label is only kept if it
        covers at least this fraction of the line's area. Used if the `labels`
        DataFrame does not provide a `threshold` column, or to fill `NaN`
        values thereof.

    Returns
    -------
    pd.DataFrame
        A copy of the lines table, with the labels added.
        NOTE(review): when the tables share columns, lines without any
        candidate label (e.g. a page with no label) are absent from the
        result, because both joins are inner joins. Confirm this is intended.
    """

    # Temporary row identifier. The name is chosen so that it cannot clash with
    # a column of either table: a clash would overwrite user data or change the
    # join keys.
    key = "uid"
    while key in lines.columns or key in labels.columns:
        key = f"_{key}"

    # `assign` returns a new frame, so the caller's table is left untouched.
    lines = lines.assign(**{key: range(len(lines))})

    geometry = sorted({key, "page", "x0", "y0", "x1", "y1"} & set(lines.columns))
    df = lines[geometry].copy()

    labels = labels.copy()
    if "threshold" in labels.columns:
        labels["threshold"] = labels["threshold"].fillna(threshold)
    else:
        labels["threshold"] = threshold

    # Join on the shared columns if there are any, else pair every line with
    # every label.
    shared = set(df.columns) & set(labels.columns)
    df = df.merge(labels, how="inner" if shared else "cross")

    # Width and height of the intersection; negative when the boxes are disjoint.
    df["dx"] = df[["x1", "X1"]].min(axis=1) - df[["x0", "X0"]].max(axis=1)
    df["dy"] = df[["y1", "Y1"]].min(axis=1) - df[["y0", "Y0"]].max(axis=1)

    # The boolean factors zero out the area of disjoint boxes.
    df["overlap"] = (df["dx"] > 0) * (df["dy"] > 0) * df["dx"] * df["dy"]

    # Fraction of the line covered by the label.
    df["area"] = (df["x1"] - df["x0"]) * (df["y1"] - df["y0"])
    df["ratio"] = df["overlap"] / df["area"]

    # Fraction of the label covered by the line.
    df["area_mask"] = (df["X1"] - df["X0"]) * (df["Y1"] - df["Y0"])
    df["ratio_mask"] = df["overlap"] / df["area_mask"]

    df["thresholded"] = df["ratio"] >= df["threshold"]

    # Best candidate first, then keep exactly that row for each line.
    # `drop_duplicates` keeps whole rows, whereas `GroupBy.first` picks the
    # first non-null value column by column.
    df = df.sort_values(["thresholded", "ratio_mask"], ascending=False)
    df = df.drop_duplicates(subset=key, keep="first")
    df = df.sort_values(key).reset_index(drop=True)

    # Discard the label of candidates that did not pass the threshold.
    df["label"] = df["label"].where(df["thresholded"])

    return lines.merge(df[[key, "label"]], on=key).drop(columns=[key])

random

RandomClassifier

Bases: BaseClassifier

Random classifier, for chaos purposes. Classifies each line to a random element.

Source code in edspdf/classifiers/random.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
@registry.classifiers.register("random.v1")
class RandomClassifier(BaseClassifier):
    """
    Random classifier, for chaos purposes. Assigns each line a class drawn at
    random.

    Parameters
    ----------
    classes : Union[List[str], Dict[str, float]]
        Either a list of class names, all given the same weight, or a mapping
        from class name to weight. Weights are normalised so that they sum
        to one and are used as sampling probabilities.
    seed : Optional[int], default 0
        Seed handed to ``np.random.default_rng``.
    """

    def __init__(
        self,
        classes: Union[List[str], Dict[str, float]],
        seed: Optional[int] = 0,
    ) -> None:

        if isinstance(classes, list):
            # A bare list means equal weights.
            classes = dict.fromkeys(classes, 1)

        # Compute the normalisation constant once rather than once per class.
        total = sum(classes.values())
        self.classes = {c: w / total for c, w in classes.items()}

        self.rgn = np.random.default_rng(seed=seed)

    def predict(self, lines: pd.DataFrame) -> List[str]:
        """
        Draw one class per line.

        Parameters
        ----------
        lines : pd.DataFrame
            Lines to classify; only the number of rows is used.

        Returns
        -------
        List[str]
            One randomly drawn class name per line.
        """
        choices = self.rgn.choice(
            list(self.classes.keys()),
            p=list(self.classes.values()),
            size=len(lines),
        )

        # `tolist` converts the NumPy string scalars to plain Python `str`.
        return choices.tolist()

mask

MaskClassifier

Bases: BaseClassifier

Mask classifier, that reproduces the PdfBox behaviour.

Source code in edspdf/classifiers/mask.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
class MaskClassifier(BaseClassifier):
    """
    Mask classifier, that reproduces the PdfBox behaviour.

    The masks given at construction, followed by a trailing ``pollution``
    mask, are stored as a table; lines are labelled by aligning them against
    that table with ``align_labels``.
    """

    def __init__(
        self,
        *ms: Mask,
    ) -> None:
        # The "pollution" mask always comes after the user-supplied masks.
        records = [m.dict() for m in (*ms, Mask(label="pollution"))]
        self.comparison = pd.DataFrame.from_records(records)

    def predict(self, lines: pd.DataFrame) -> pd.Series:
        """
        Return the ``label`` column obtained by aligning `lines` with the masks.
        """
        aligned = align_labels(lines, self.comparison)
        return aligned.label

dummy

DummyClassifier

Bases: BaseClassifier

"Dummy" classifier, for testing purposes. Classifies every line to body.

Source code in edspdf/classifiers/dummy.py
10
11
12
13
14
15
16
17
@registry.classifiers.register("dummy.v1")
class DummyClassifier(BaseClassifier):
    """
    "Dummy" classifier, for testing purposes. Every line is labelled ``body``,
    whatever its content.
    """

    def predict(self, lines: pd.DataFrame) -> List[str]:
        """
        Return ``"body"`` once for each row of `lines`.
        """
        n_lines = len(lines)
        return ["body" for _ in range(n_lines)]

base

BaseClassifier

Bases: ABC

Source code in edspdf/classifiers/base.py
 7
 8
 9
10
11
12
13
14
15
class BaseClassifier(ABC):
    """
    Abstract base class for line classifiers.

    Subclasses implement ``predict``; calling an instance forwards to it.
    """

    @abstractmethod
    def predict(self, lines: pd.DataFrame) -> List[str]:
        """
        Handles the classification.
        """

    def __call__(self, lines: pd.DataFrame) -> List[str]:
        # Calling the classifier is shorthand for `predict`.
        predictions = self.predict(lines)
        return predictions

predict(lines) abstractmethod

Handles the classification.

Source code in edspdf/classifiers/base.py
 8
 9
10
11
12
@abstractmethod
def predict(self, lines: pd.DataFrame) -> List[str]:
    """
    Handles the classification.

    Abstract: the method has no implementation here and must be provided by
    subclasses.

    Parameters
    ----------
    lines : pd.DataFrame
        Lines to classify.

    Returns
    -------
    List[str]
        Predicted labels (presumably one per line, in input order; TODO
        confirm the contract with the implementations).
    """