Skip to content

edspdf.aggregators.styled

StyledAggregator

Bases: SimpleAggregator

Aggregator that returns text and styles.

Source code in edspdf/aggregators/styled.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
@registry.aggregators.register("styled.v1")
class StyledAggregator(SimpleAggregator):
    """
    Aggregator that returns text and styles.

    For every label, the text of the lines is concatenated and the character
    spans of the line-level styles are shifted so that they index into the
    concatenated text rather than into their original line.
    """

    def aggregate(
        self, lines: pd.DataFrame
    ) -> Tuple[Dict[str, str], Dict[str, List[Dict]]]:
        """
        Aggregate lines into one text and one list of styles per label.

        Parameters
        ----------
        lines : pd.DataFrame
            One row per line. The columns read here are ``page``, ``y1``,
            ``x0``, ``label`` and ``styles``; each entry of ``styles`` is
            expected to be a list of dicts holding at least ``start`` and
            ``end`` offsets relative to the line. Any other column required
            by ``prepare_newlines`` must also be present.

        Returns
        -------
        Tuple[Dict[str, str], Dict[str, List[Dict]]]
            ``(text, style)``: ``text`` maps each label to its concatenated
            text, ``style`` maps each label to a list of style records whose
            ``start``/``end`` are relative to that concatenated text. Labels
            without any style map to an empty list. Both dicts are empty when
            ``lines`` is empty.
        """

        if len(lines) == 0:
            return {}, {}

        # `sort_values` returns a new frame, so the column assignments below
        # do not leak into the caller's dataframe.
        lines = lines.sort_values(["page", "y1", "x0"])
        # Labels missing from `label_map` are kept unchanged.
        lines["label"] = lines["label"].map(
            lambda label: self.label_map.get(label, label)
        )

        # Stable identifier used to re-attach styles to their line once the
        # per-line offsets are known.
        lines["line_id"] = range(len(lines))

        # One row per (line, style); lines without styles are dropped.
        styles = lines[["line_id", "styles"]].explode("styles").dropna().reset_index()
        # Expand the style dicts into columns. The index was reset above so
        # that the positional join lines up with the normalized frame.
        styles = styles[["line_id"]].join(pd.json_normalize(styles.styles))

        # NOTE(review): `prepare_newlines` is defined elsewhere; it is
        # expected to return the lines with a `text_with_newline` column.
        lines = prepare_newlines(
            lines,
            nl_threshold=self.nl_threshold,
            np_threshold=self.np_threshold,
        )

        # Offset of a line = total length of the preceding lines that share
        # its label (cumulative sum, shifted by one row within each label).
        lines["offset"] = lines["text_with_newline"].str.len()
        lines["offset"] = lines.groupby(["label"])["offset"].transform("cumsum")
        lines["offset"] = lines.groupby(["label"])["offset"].transform("shift")
        lines["offset"] = lines["offset"].fillna(0).astype(int)

        styles = styles.merge(lines[["line_id", "offset", "label"]], on="line_id")
        # When no line carries a style, the normalized frame has no
        # "start"/"end" columns at all; shifting them would raise a KeyError,
        # so the shift is skipped and every label gets an empty style list.
        if not styles.empty:
            styles["start"] += styles["offset"]
            styles["end"] += styles["offset"]

        df = lines.groupby(["label"]).agg(text=("text_with_newline", "sum"))

        text = df.text.to_dict()
        # A plain boolean mask is used instead of `DataFrame.query`, which
        # would have to resolve the comprehension variable through frame
        # introspection.
        style = {
            label: styles[styles["label"] == label]
            .drop(columns=["line_id", "offset", "label"])
            .to_dict(orient="records")
            for label in text
        }

        return text, style