`edspdf.extractors.functional`

`get_blocs(layout)`

Extract text blocs from a PDFMiner layout generator.

Arguments

layout: PDFMiner layout generator.

YIELDS DESCRIPTION

bloc

Text bloc

TYPE: Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]

Source code in edspdf/extractors/functional.py

def get_blocs(
    layout: Iterator[LTPage],
) -> Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]:
    """
    Iterate over the horizontal text boxes of every page in a PDFMiner layout.

    Parameters
    ----------
    layout : Iterator[LTPage]
        PDFMiner layout generator, consumed one page at a time.

    Yields
    ------
    Tuple[LTTextBoxHorizontal, int, float, float]
        The text bloc, the zero-based index of the page it was found on,
        and that page's width and height.
    """
    for page_index, page in enumerate(layout):
        page_width, page_height = page.width, page.height

        # Anything on the page that is not a horizontal text box is skipped.
        text_boxes = (
            item for item in page if isinstance(item, LTTextBoxHorizontal)
        )
        for text_box in text_boxes:
            yield text_box, page_index, page_width, page_height

`get_lines(layout)`

Extract lines from a PDFMiner layout object.

Each line's coordinates are normalised by the page width and height, and reframed so that the origin is the top-left corner.

PARAMETER DESCRIPTION

layout

PDFMiner layout object.

TYPE: Iterator[LTPage]

YIELDS	DESCRIPTION
`Iterator[Line]`	Single line object.

Source code in edspdf/extractors/functional.py

def get_lines(layout: Iterator[LTPage]) -> Iterator[Line]:
    """
    Turn a PDFMiner layout into a stream of `Line` objects.

    Coordinates are divided by the page width/height and the vertical axis
    is flipped, so that the origin ends up in the top left corner.

    Parameters
    ----------
    layout : Iterator[LTPage]
        PDFMiner layout object.

    Yields
    ------
    Line
        One object per line of every text bloc. ``bloc`` is the running
        index of the bloc over the whole layout (it is not reset on each
        page) and ``page`` is the index of the page holding the line.
    """
    for bloc_index, bloc_info in enumerate(get_blocs(layout)):
        bloc, page_index, page_width, page_height = bloc_info

        for text_line in bloc:
            text, styles = extract_style(
                text_line,
                width=page_width,
                height=page_height,
            )

            # Horizontal extent, as a fraction of the page width.
            left = text_line.x0 / page_width
            right = text_line.x1 / page_width

            # Vertical flip (1 - y / height): y1 and y0 swap roles so that
            # y0 <= y1 still holds once the axis points downwards.
            top = 1 - text_line.y1 / page_height
            bottom = 1 - text_line.y0 / page_height

            yield Line(
                page=page_index,
                bloc=bloc_index,
                x0=left,
                x1=right,
                y0=top,
                y1=bottom,
                page_width=page_width,
                page_height=page_height,
                text=text,
                styles=styles,
            )

`remove_outside_lines(lines, strict_mode=False)`

Filter out lines that are outside the canvas.

PARAMETER DESCRIPTION

lines

Dataframe of extracted lines

TYPE: pd.DataFrame

strict_mode

Whether to remove the line if any part of it is outside the canvas, by default False

TYPE: bool, optional DEFAULT: False

RETURNS	DESCRIPTION
`pd.DataFrame`	Filtered lines.

Source code in edspdf/extractors/functional.py

def remove_outside_lines(
    lines: pd.DataFrame,
    strict_mode: bool = False,
) -> pd.DataFrame:
    """
    Filter out lines that are outside the canvas.

    The canvas is the unit square: the columns ``x0``, ``y0``, ``x1`` and
    ``y1`` are compared against the bounds 0 and 1.

    Parameters
    ----------
    lines : pd.DataFrame
        Dataframe of extracted lines. Must provide the columns ``x0``,
        ``y0``, ``x1`` and ``y1``.
    strict_mode : bool, optional
        Whether to remove the line if any part of it is outside the canvas,
        by default False. When False, a line is only removed if it lies
        wholly before the origin or wholly beyond the far corner.

    Returns
    -------
    pd.DataFrame
        Filtered lines. The input dataframe is not modified and the
        original index labels are preserved.
    """
    if strict_mode:
        # Keep a line only if its whole bounding box fits in [0, 1] x [0, 1].
        starts_inside = lines[["x0", "y0"]].min(axis=1) >= 0
        ends_inside = lines[["x1", "y1"]].max(axis=1) <= 1
        return lines[starts_inside & ends_inside]

    # Line ends before the origin on both axes.
    before_origin = lines[["x1", "y1"]].max(axis=1) < 0
    # Line starts beyond the far corner on both axes. The bound is 1 (the
    # canvas extent): comparing with 0 would discard almost every line that
    # sits normally inside the page.
    beyond_extent = lines[["x0", "y0"]].min(axis=1) > 1
    # NOTE(review): a line that is off-canvas along a single axis only
    # (e.g. x1 < 0 with y inside the page) is kept here - confirm intended.
    return lines[~(before_origin | beyond_extent)]