Skip to content

edspdf.components.aggregators.styled

StyledAggregator

Bases: SimpleAggregator

Aggregator that returns, for each label, the aggregated text and the styles of its lines.

Source code in edspdf/components/aggregators/styled.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
@registry.factory.register("styled-aggregator")
class StyledAggregator(SimpleAggregator):
    """
    Aggregator that returns text and styles.

    For every label found on the document's lines, the aggregator builds the
    concatenated text of those lines together with the list of their styles.
    Each style's ``begin``/``end`` offsets are shifted so that they index into
    the concatenated text rather than into the individual line.
    """

    def __call__(self, doc: PDFDoc) -> Tuple[Dict[str, str], Dict[str, List[Dict]]]:
        """
        Aggregate the labelled lines of a document into texts and styles.

        Lines with an empty text or without a label are ignored. The remaining
        lines are ordered by label, page, row and horizontal position, then
        joined label by label. The separator inserted between two consecutive
        lines is:

        - ``"\\n\\n"`` when the next line is on another page, or when the
          vertical gap (relative to the median line height of the label)
          exceeds ``self.new_paragraph_threshold``;
        - ``"\\n"`` when that relative gap exceeds ``self.new_line_threshold``;
        - ``" "`` otherwise.

        Parameters
        ----------
        doc : PDFDoc
            Document whose ``lines`` are aggregated.

        Returns
        -------
        Tuple[Dict[str, str], Dict[str, List[Dict]]]
            A ``(texts, styles)`` pair, both keyed by label. ``styles[label]``
            holds one dictionary per style (as returned by ``style.dict()``)
            with ``begin`` and ``end`` expressed relative to ``texts[label]``.
        """
        # Average line height, used to bucket lines into rows so that boxes
        # sitting on the same visual row are ordered from left to right.
        row_height = sum(b.y1 - b.y0 for b in doc.lines) / max(1, len(doc.lines))

        def row_index(box):
            # A zero average height (only degenerate boxes) would mean
            # dividing by zero; fall back to the exact vertical position.
            return box.y1 // row_height if row_height else box.y1

        all_lines = sorted(
            [
                line
                for line in doc.lines
                if len(line.text) > 0 and line.label is not None
            ],
            key=lambda b: (b.label, b.page, row_index(b), b.x0),
        )

        texts = {}
        styles = {}
        # `all_lines` is sorted by label first, as `groupby` requires.
        for label, group in groupby(all_lines, key=lambda b: b.label):
            lines: List[TextBox] = list(group)
            label_styles = styles[label] = []

            # Typical line height for this label; vertical gaps are compared
            # to the thresholds relative to it.
            height = np.median(np.asarray([line.y1 - line.y0 for line in lines]))

            # Collect the pieces and join once at the end, tracking the
            # running length to shift the style offsets.
            parts = []
            offset = 0
            for line, next_box in zip(lines, [*lines[1:], None]):
                for style in line.styles:
                    style_dict = style.dict()
                    style_dict["begin"] += offset
                    style_dict["end"] += offset
                    label_styles.append(style_dict)

                parts.append(line.text)
                offset += len(line.text)

                # No separator after the last line of the label.
                if next_box is None:
                    continue

                if line.page != next_box.page:
                    separator = "\n\n"
                else:
                    gap = (next_box.y1 - line.y1) / height
                    if gap > self.new_paragraph_threshold:
                        separator = "\n\n"
                    elif gap > self.new_line_threshold:
                        separator = "\n"
                    else:
                        separator = " "

                parts.append(separator)
                offset += len(separator)

            texts[label] = "".join(parts)

        return texts, styles