Skip to content

edspdf.aggregators

styled

StyledAggregator

Bases: SimpleAggregator

Aggregator that returns text and styles.

Source code in edspdf/aggregators/styled.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
@registry.aggregators.register("styled.v1")
class StyledAggregator(SimpleAggregator):
    """
    Aggregator that returns text and styles.

    Lines are concatenated per label, and the style spans attached to each
    line are re-expressed as offsets into the concatenated text of their label.
    """

    def aggregate(
        self, lines: pd.DataFrame
    ) -> Tuple[Dict[str, str], Dict[str, List[Dict]]]:
        """
        Aggregate lines into one text per label, with matching style spans.

        Parameters
        ----------
        lines : pd.DataFrame
            Lines to aggregate. This method reads the ``page``, ``y1``, ``x0``
            and ``styles`` columns, plus the ``label`` and
            ``text_with_newline`` columns available after ``prepare_newlines``.
            Each ``styles`` cell is exploded and normalised, so it is presumably
            a list of dicts carrying at least ``start`` and ``end``
            (TODO confirm against the extractor).

        Returns
        -------
        Tuple[Dict[str, str], Dict[str, List[Dict]]]
            ``(text, style)``: ``text`` maps each label to its concatenated
            text, ``style`` maps each label to a list of style records whose
            ``start``/``end`` are shifted by the offset of their line within
            the label's text. Labels without any style map to an empty list.
            Empty input yields ``({}, {})``.
        """

        # Nothing to aggregate: without this guard the style bookkeeping below
        # would fail on the missing "start"/"end" columns.
        if len(lines) == 0:
            return {}, {}

        lines = lines.sort_values(["page", "y1", "x0"])

        # Stable identifier used to re-attach styles to their line once the
        # lines have gone through `prepare_newlines`.
        lines["line_id"] = range(len(lines))

        # One row per (line, style). The index is reset so that it lines up
        # positionally with the frame produced by `json_normalize`.
        exploded = (
            lines[["line_id", "styles"]]
            .explode("styles")
            .dropna()
            .reset_index(drop=True)
        )
        has_styles = len(exploded) > 0
        if has_styles:
            styles = exploded[["line_id"]].join(pd.json_normalize(exploded.styles))

        lines = prepare_newlines(
            lines,
            nl_threshold=self.nl_threshold,
            np_threshold=self.np_threshold,
        )

        # Offset of each line inside its label's text: the cumulative length
        # of the preceding lines sharing that label (0 for the first one).
        lines["offset"] = lines["text_with_newline"].str.len()
        lines["offset"] = lines.groupby(["label"])["offset"].transform("cumsum")
        lines["offset"] = lines.groupby(["label"])["offset"].transform("shift")
        lines["offset"] = lines["offset"].fillna(0).astype(int)

        df = lines.groupby(["label"]).agg(text=("text_with_newline", "sum"))
        text = df.text.to_dict()

        # No line carries a style: the normalised frame would have no
        # "start"/"end" columns, so report an empty style list per label.
        if not has_styles:
            return text, {label: [] for label in text}

        styles = styles.merge(lines[["line_id", "offset", "label"]], on="line_id")
        styles["start"] += styles.offset
        styles["end"] += styles.offset

        style = {
            label: styles[styles["label"] == label]
            .drop(columns=["line_id", "offset", "label"])
            .to_dict(orient="records")
            for label in text
        }

        return text, style

base

BaseAggregator

Bases: ABC

Source code in edspdf/aggregators/base.py
 7
 8
 9
10
11
12
13
14
15
16
17
class BaseAggregator(ABC):
    """
    Abstract base class for aggregators.

    Subclasses implement `aggregate`; instances are callable and forward
    to it.
    """

    @abstractmethod
    def aggregate(self, lines: pd.DataFrame) -> Dict[str, str]:
        """
        Aggregate the provided lines into text.

        Must be implemented by concrete subclasses.
        """

    def __call__(self, lines: pd.DataFrame, copy: bool = False) -> Dict[str, str]:
        """
        Run `aggregate` on `lines`.

        When `copy` is true, the aggregation operates on `lines.copy()`
        rather than on the object that was passed in.
        """
        target = lines.copy() if copy else lines
        return self.aggregate(target)

aggregate(lines) abstractmethod

Handles the text aggregation

Source code in edspdf/aggregators/base.py
 8
 9
10
11
12
@abstractmethod
def aggregate(self, lines: pd.DataFrame) -> Dict[str, str]:
    """
    Aggregate the provided lines into text.

    Abstract hook with no default behaviour: the body is intentionally
    empty and concrete aggregators supply the implementation.
    """