Bases: SimpleAggregator
Aggregator that returns text and styles.
Source code in edspdf/aggregators/styled.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57 | @registry.aggregators.register("styled.v1")
class StyledAggregator(SimpleAggregator):
    """
    Aggregator that returns text and styles.

    For every label, the text of its lines is concatenated and the ``start`` /
    ``end`` offsets of each style are shifted by the position of the owning
    line inside that concatenated text.
    """

    def aggregate(
        self, lines: pd.DataFrame
    ) -> Tuple[Dict[str, str], Dict[str, List[Dict]]]:
        """
        Aggregate lines into one text and one list of styles per label.

        Parameters
        ----------
        lines : pd.DataFrame
            One row per line. The columns ``page``, ``y1``, ``x0``, ``label``
            and ``styles`` are read here; ``styles`` is exploded and passed to
            ``pd.json_normalize``, and the resulting ``start`` / ``end``
            columns are shifted.
            NOTE(review): presumably each ``styles`` cell is a list of dicts
            holding at least ``start`` and ``end`` - confirm against the
            extractor.

        Returns
        -------
        Tuple[Dict[str, str], Dict[str, List[Dict]]]
            ``(text, style)``: the concatenated text per label, and the style
            records per label. Both are empty dicts when ``lines`` is empty.
        """
        if len(lines) == 0:
            return {}, {}

        # sort_values returns a new frame, so the columns added below do not
        # touch the frame owned by the caller.
        lines = lines.sort_values(["page", "y1", "x0"])
        lines["label"] = lines["label"].map(
            lambda label: self.label_map.get(label, label)
        )
        lines["line_id"] = range(len(lines))

        # One row per (line, style); lines without any style are dropped.
        exploded = (
            lines[["line_id", "styles"]].explode("styles").dropna().reset_index()
        )
        styles = exploded[["line_id"]].join(pd.json_normalize(exploded["styles"]))

        # NOTE(review): prepare_newlines is expected to add the
        # "text_with_newline" column used below while keeping "line_id" and
        # "label" - confirm against its definition.
        lines = prepare_newlines(
            lines,
            nl_threshold=self.nl_threshold,
            np_threshold=self.np_threshold,
        )

        # Offset of a line = total length of the previous lines that share its
        # label (cumulative length, shifted by one line; 0 for the first one).
        lines["offset"] = lines["text_with_newline"].str.len()
        lines["offset"] = lines.groupby(["label"])["offset"].transform("cumsum")
        lines["offset"] = lines.groupby(["label"])["offset"].transform("shift")
        lines["offset"] = lines["offset"].fillna(0).astype(int)

        styles = styles.merge(lines[["line_id", "offset", "label"]], on="line_id")

        # When no line has any style, json_normalize produces no "start" /
        # "end" columns at all; shifting them would raise a KeyError, so the
        # shift is skipped and every label gets an empty style list.
        if not styles.empty:
            styles["start"] += styles["offset"]
            styles["end"] += styles["offset"]

        df = lines.groupby(["label"]).agg(text=("text_with_newline", "sum"))
        text = df["text"].to_dict()

        style = {
            label: styles[styles["label"] == label]
            .drop(columns=["line_id", "offset", "label"])
            .to_dict(orient="records")
            for label in text
        }

        return text, style
|