Skip to content

edspdf.extractors.functional

get_blocs(layout)

Extract text blocs from a PDFMiner layout generator.

Arguments

layout: PDFMiner layout generator.

YIELDS DESCRIPTION
bloc

Text bloc

TYPE: Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]

Source code in edspdf/extractors/functional.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def get_blocs(
    layout: Iterator[LTPage],
) -> Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]:
    """
    Iterate over the horizontal text boxes of every page of a layout.

    Arguments
    ---------
    layout:
        PDFMiner layout generator, producing one page object at a time.

    Yields
    ------
    bloc :
        Tuple ``(bloc, page_index, page_width, page_height)``, where
        ``page_index`` is the zero-based position of the page in ``layout``.
        Page elements that are not ``LTTextBoxHorizontal`` are skipped.
    """
    for page_index, page in enumerate(layout):
        # Page dimensions are read once and attached to every bloc of the page.
        page_width, page_height = page.width, page.height

        text_boxes = (
            element
            for element in page
            if isinstance(element, LTTextBoxHorizontal)
        )
        for text_box in text_boxes:
            yield text_box, page_index, page_width, page_height

get_lines(layout)

Extract lines from a PDFMiner layout object.

Line coordinates are divided by the page width and height, and reframed so that the origin is the top-left corner of the page.

PARAMETER DESCRIPTION
layout

PDFMiner layout object.

TYPE: Iterator[LTPage]

YIELDS DESCRIPTION
Line

Single line object.

Source code in edspdf/extractors/functional.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def get_lines(layout: Iterator[LTPage]) -> Iterator[Line]:
    """
    Extract lines from a PDFMiner layout object.

    Coordinates are divided by the page width and height, and the vertical
    axis is flipped so that the origin is the top left corner of the page.

    Parameters
    ----------
    layout : Iterator[LTPage]
        PDFMiner layout object.

    Yields
    -------
    Line
        Single line object. ``bloc`` is the running index of the enclosing
        bloc, counted across all pages of the layout.
    """
    for bloc_index, bloc_info in enumerate(get_blocs(layout)):
        bloc, page_index, page_width, page_height = bloc_info

        for text_line in bloc:
            text, styles = extract_style(
                text_line,
                width=page_width,
                height=page_height,
            )

            # Horizontal extent, relative to the page width.
            left = text_line.x0 / page_width
            right = text_line.x1 / page_width

            # Vertical extent: the source y axis is flipped, so the line's
            # upper edge (y1) becomes the smaller coordinate.
            top = 1 - text_line.y1 / page_height
            bottom = 1 - text_line.y0 / page_height

            yield Line(
                page=page_index,
                bloc=bloc_index,
                x0=left,
                x1=right,
                y0=top,
                y1=bottom,
                page_width=page_width,
                page_height=page_height,
                text=text,
                styles=styles,
            )

remove_outside_lines(lines, strict_mode=False)

Filter out lines that are outside the canvas.

PARAMETER DESCRIPTION
lines

Dataframe of extracted lines

TYPE: pd.DataFrame

strict_mode

Whether to remove the line if any part of it is outside the canvas, by default False

TYPE: bool, optional DEFAULT: False

RETURNS DESCRIPTION
pd.DataFrame

Filtered lines.

Source code in edspdf/extractors/functional.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def remove_outside_lines(
    lines: pd.DataFrame,
    strict_mode: bool = False,
) -> pd.DataFrame:
    """
    Filter out lines that are outside the canvas.

    The canvas is taken to be the unit square: a coordinate is inside when it
    lies between 0 and 1 (bounds included). Boxes are assumed to satisfy
    ``x0 <= x1`` and ``y0 <= y1``.

    Parameters
    ----------
    lines : pd.DataFrame
        Dataframe of extracted lines, with columns ``x0``, ``y0``, ``x1``
        and ``y1``.
    strict_mode : bool, optional
        Whether to remove the line if any part of it is outside the canvas,
        by default False. When False, only the lines that lie entirely
        outside the canvas are removed; lines overlapping it are kept.

    Returns
    -------
    pd.DataFrame
        Filtered lines.
    """
    if strict_mode:
        # Keep only the lines fully contained in the canvas.
        lower = lines[["x0", "y0"]].min(axis=1) >= 0
        upper = lines[["x1", "y1"]].max(axis=1) <= 1
        return lines[lower & upper]

    # A box misses the canvas entirely as soon as, on at least one axis, it
    # ends before the origin (x1 < 0 or y1 < 0) or starts past the far edge
    # (x0 > 1 or y0 > 1). Hence `min` on the end coordinates and `max` on the
    # start coordinates, with the far edge at 1 rather than 0.
    below = lines[["x1", "y1"]].min(axis=1) < 0
    above = lines[["x0", "y0"]].max(axis=1) > 1
    return lines[~(below | above)]