Skip to content

edspdf.extractors.pdfminer

PdfMinerExtractor

Bases: BaseExtractor

Extractor object. Given a PDF byte stream, produces a list of blocks.

PARAMETER DESCRIPTION
line_overlap

See PDFMiner documentation

TYPE: float DEFAULT: 0.5

char_margin

See PDFMiner documentation

TYPE: float DEFAULT: 2.0

line_margin

See PDFMiner documentation

TYPE: float DEFAULT: 0.5

word_margin

See PDFMiner documentation

TYPE: float DEFAULT: 0.1

boxes_flow

See PDFMiner documentation

TYPE: Optional[float] DEFAULT: 0.5

detect_vertical

See PDFMiner documentation

TYPE: bool DEFAULT: False

all_texts

See PDFMiner documentation

TYPE: bool DEFAULT: False

Source code in edspdf/extractors/pdfminer.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
@registry.extractors.register("pdfminer.v1")
class PdfMinerExtractor(BaseExtractor):
    """
    Extractor object. Given a PDF byte stream, produces a DataFrame with
    one row per extracted line.

    The constructor arguments are forwarded unchanged to PDFMiner's
    ``LAParams`` object, which is then reused for every processed document.

    Parameters
    ----------
    line_overlap : float
        See PDFMiner documentation
    char_margin : float
        See PDFMiner documentation
    line_margin : float
        See PDFMiner documentation
    word_margin : float
        See PDFMiner documentation
    boxes_flow : Optional[float]
        See PDFMiner documentation
    detect_vertical : bool
        See PDFMiner documentation
    all_texts : bool
        See PDFMiner documentation
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ):

        # Layout-analysis settings handed to `extract_pages` on each call.
        self.laparams = LAParams(
            line_overlap=line_overlap,
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            boxes_flow=boxes_flow,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )

    def generate_lines(self, pdf: bytes) -> pd.DataFrame:
        """
        Generates a dataframe from all the lines found in the PDF.

        Parameters
        ----------
        pdf : bytes
            Byte stream representing the PDF.

        Returns
        -------
        pd.DataFrame
            DataFrame with one row per line. A ``line_id`` column numbers
            the rows from 0 in extraction order; it is present even when
            no line was found, so the schema does not depend on the input.
        """

        pdf_stream = BytesIO(pdf)

        layout = extract_pages(pdf_stream, laparams=self.laparams)
        lines = list(get_lines(layout))

        if not lines:
            # Empty result: still expose "line_id" so that callers see the
            # same column as in the non-empty case below.
            # NOTE(review): the other columns are presumably the fields of
            # `line.dict()` - confirm against the line model.
            return pd.DataFrame(
                columns=[
                    "page",
                    "bloc",
                    "x0",
                    "x1",
                    "y0",
                    "y1",
                    "page_width",
                    "page_height",
                    "text",
                    "styles",
                    "line_id",
                ]
            )

        df = pd.DataFrame.from_records([line.dict() for line in lines])
        # Sequential identifier following extraction order.
        df["line_id"] = range(len(df))

        return df

    def extract(self, pdf: bytes) -> pd.DataFrame:
        """
        Process a single PDF document.

        Parameters
        ----------
        pdf : bytes
            Raw byte representation of the PDF document.

        Returns
        -------
        pd.DataFrame
            DataFrame containing one row for each line extracted using
            PDFMiner, after dropping lines with empty text and lines
            rejected by ``remove_outside_lines``.
        """

        lines = self.generate_lines(pdf)

        # Remove empty lines
        lines = lines[lines.text.str.len() > 0]

        # Remove lines that are outside the page
        lines = remove_outside_lines(lines, strict_mode=True)

        return lines

generate_lines(pdf)

Generates a dataframe from all blocks in the PDF.

PARAMETER DESCRIPTION
pdf

Byte stream representing the PDF.

TYPE: bytes

RETURNS DESCRIPTION
pd.DataFrame

DataFrame representing the blocks.

Source code in edspdf/extractors/pdfminer.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def generate_lines(self, pdf: bytes) -> pd.DataFrame:
    """
    Generates a dataframe from all the lines found in the PDF.

    Parameters
    ----------
    pdf : bytes
        Byte stream representing the PDF.

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per line. A ``line_id`` column numbers the
        rows from 0 in extraction order; it is present even when no line
        was found, so the schema does not depend on the input.
    """

    pdf_stream = BytesIO(pdf)

    layout = extract_pages(pdf_stream, laparams=self.laparams)
    lines = list(get_lines(layout))

    if not lines:
        # Empty result: still expose "line_id" so that callers see the
        # same column as in the non-empty case below.
        # NOTE(review): the other columns are presumably the fields of
        # `line.dict()` - confirm against the line model.
        return pd.DataFrame(
            columns=[
                "page",
                "bloc",
                "x0",
                "x1",
                "y0",
                "y1",
                "page_width",
                "page_height",
                "text",
                "styles",
                "line_id",
            ]
        )

    df = pd.DataFrame.from_records([line.dict() for line in lines])
    # Sequential identifier following extraction order.
    df["line_id"] = range(len(df))

    return df

extract(pdf)

Process a single PDF document.

PARAMETER DESCRIPTION
pdf

Raw byte representation of the PDF document.

TYPE: bytes

RETURNS DESCRIPTION
pd.DataFrame

DataFrame containing one row for each line extracted using PDFMiner.

Source code in edspdf/extractors/pdfminer.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def extract(self, pdf: bytes) -> pd.DataFrame:
    """
    Extract the usable text lines of one PDF document.

    Parameters
    ----------
    pdf : bytes
        Raw byte representation of the PDF document.

    Returns
    -------
    pd.DataFrame
        One row per line produced by ``generate_lines``, restricted to
        rows with non-empty text and then filtered by
        ``remove_outside_lines``.
    """

    raw_lines = self.generate_lines(pdf)

    # Keep only the rows that actually carry some text
    has_text = raw_lines.text.str.len() > 0
    non_empty = raw_lines[has_text]

    # Discard the lines that are outside the page
    return remove_outside_lines(non_empty, strict_mode=True)