Skip to content

edspdf.components.extractors.pdfminer

PdfMinerExtractor

Bases: Component

Source code in edspdf/components/extractors/pdfminer.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
@registry.factory.register("pdfminer-extractor")
class PdfMinerExtractor(Component):
    """
    Component that extracts text lines from a PDF using PDFMiner's layout
    analysis and stores them on a `PDFDoc`.
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.05,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
        extract_style: bool = False,
        raise_on_error: bool = False,
    ):
        """
        Extractor object. Given a PDF byte stream, produces a list of elements.

        Parameters
        ----------
        line_overlap : float
            See PDFMiner documentation
        char_margin : float
            See PDFMiner documentation
        line_margin : float
            See PDFMiner documentation
        word_margin : float
            See PDFMiner documentation
        boxes_flow : Optional[float]
            See PDFMiner documentation
        detect_vertical : bool
            See PDFMiner documentation
        all_texts : bool
            See PDFMiner documentation
        extract_style : bool
            Whether to extract style (font, size, ...) information for each line of
            the document.
            Default: False
        raise_on_error : bool
            Whether to re-raise the `PDFException` raised while PDFMiner parses
            the document. When False, the failure is recorded on the document
            instead (empty `lines`, `error` set to True).
            Default: False
        """

        super().__init__()

        # The layout-analysis options are forwarded untouched to PDFMiner.
        self.laparams = LAParams(
            line_overlap=line_overlap,
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            boxes_flow=boxes_flow,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )
        self.extract_style = extract_style
        self.raise_on_error = raise_on_error

    def __call__(self, doc: Union[PDFDoc, bytes]) -> PDFDoc:
        """
        Extract the text lines of every horizontal text box in the PDF.

        Arguments
        ---------
        doc:
            PDF document, given either as a `PDFDoc` or as raw PDF bytes.
            Raw bytes are wrapped in a new `PDFDoc`.

        Returns
        -------
        PDFDoc:
            The document, with `lines` set to the sorted list of extracted
            text boxes. If PDFMiner raises a `PDFException` and
            `raise_on_error` is False, `lines` is empty and `error` is True.
        """

        if not isinstance(doc, PDFDoc):
            content = bytes(doc)
            # NOTE: the built-in `hash` of a bytes object is salted per
            # interpreter process (unless PYTHONHASHSEED is fixed), so this
            # id is not stable from one run to the next.
            doc = PDFDoc(id=str(hash(content)), content=content)
        content = doc.content
        content_stream = BytesIO(content)

        try:
            # Materialise the generator here so that parsing errors of any
            # page surface inside this try block.
            layout = list(extract_pages(content_stream, laparams=self.laparams))
        except PDFException:
            if self.raise_on_error:
                raise
            doc.lines = []
            doc.error = True
            return doc

        lines = []

        for page_no, page in enumerate(layout):

            w = page.width
            h = page.height

            for bloc in page:
                # Only horizontal text boxes are kept; every other layout
                # element of the page is skipped.
                if not isinstance(bloc, LTTextBoxHorizontal):
                    continue

                for line in bloc:
                    text, styles = extract_style_from_line(line)
                    if len(text) == 0:
                        continue
                    lines.append(
                        TextBox(
                            page=page_no,
                            # Coordinates are expressed as fractions of the
                            # page size; the vertical axis is flipped (the
                            # new y0 is derived from the line's y1).
                            x0=line.x0 / w,
                            x1=line.x1 / w,
                            y0=1 - line.y1 / h,
                            y1=1 - line.y0 / h,
                            page_width=w,
                            page_height=h,
                            text=text,
                            styles=styles if self.extract_style else (),
                        )
                    )

        # Drop the lines that are not fully contained in the page, then sort
        # the rest (the order is the one defined by `TextBox` comparisons).
        doc.lines = sorted(
            [
                line
                for line in lines
                if line.x0 >= 0 and line.y0 >= 0 and line.x1 <= 1 and line.y1 <= 1
            ]
        )

        return doc

__init__(line_overlap=0.5, char_margin=2.05, line_margin=0.5, word_margin=0.1, boxes_flow=0.5, detect_vertical=False, all_texts=False, extract_style=False, raise_on_error=False)

Extractor object. Given a PDF byte stream, produces a list of elements.

PARAMETER DESCRIPTION
line_overlap

See PDFMiner documentation

TYPE: float DEFAULT: 0.5

char_margin

See PDFMiner documentation

TYPE: float DEFAULT: 2.05

line_margin

See PDFMiner documentation

TYPE: float DEFAULT: 0.5

word_margin

See PDFMiner documentation

TYPE: float DEFAULT: 0.1

boxes_flow

See PDFMiner documentation

TYPE: Optional[float] DEFAULT: 0.5

detect_vertical

See PDFMiner documentation

TYPE: bool DEFAULT: False

all_texts

See PDFMiner documentation

TYPE: bool DEFAULT: False

extract_style

Whether to extract style (font, size, ...) information for each line of the document. Default: False

TYPE: bool DEFAULT: False

Source code in edspdf/components/extractors/pdfminer.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def __init__(
    self,
    line_overlap: float = 0.5,
    char_margin: float = 2.05,
    line_margin: float = 0.5,
    word_margin: float = 0.1,
    boxes_flow: Optional[float] = 0.5,
    detect_vertical: bool = False,
    all_texts: bool = False,
    extract_style: bool = False,
    raise_on_error: bool = False,
):
    """
    Build the extractor and store its PDFMiner layout-analysis settings.

    Parameters
    ----------
    line_overlap : float
        Forwarded to `LAParams`, see PDFMiner documentation
    char_margin : float
        Forwarded to `LAParams`, see PDFMiner documentation
    line_margin : float
        Forwarded to `LAParams`, see PDFMiner documentation
    word_margin : float
        Forwarded to `LAParams`, see PDFMiner documentation
    boxes_flow : Optional[float]
        Forwarded to `LAParams`, see PDFMiner documentation
    detect_vertical : bool
        Forwarded to `LAParams`, see PDFMiner documentation
    all_texts : bool
        Forwarded to `LAParams`, see PDFMiner documentation
    extract_style : bool
        Whether style information (font, size, ...) is kept for each
        extracted line.
    raise_on_error : bool
        Whether a `PDFException` raised while parsing is re-raised rather
        than recorded on the document.
    """

    super().__init__()

    # Gather the layout-analysis options first, then hand them to PDFMiner
    # in a single call.
    layout_options = {
        "line_overlap": line_overlap,
        "char_margin": char_margin,
        "line_margin": line_margin,
        "word_margin": word_margin,
        "boxes_flow": boxes_flow,
        "detect_vertical": detect_vertical,
        "all_texts": all_texts,
    }
    self.laparams = LAParams(**layout_options)

    # Flags read later when the component is called on a document.
    self.extract_style = extract_style
    self.raise_on_error = raise_on_error

__call__(doc)

Extract the text lines of every horizontal text box in the PDF.

Arguments

doc: PDF document, given either as a PDFDoc or as raw PDF bytes

RETURNS DESCRIPTION
PDFDoc

PDF document

TYPE: PDFDoc

Source code in edspdf/components/extractors/pdfminer.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def __call__(self, doc: Union[PDFDoc, bytes]) -> PDFDoc:
    """
    Extract the text lines of every horizontal text box in the PDF.

    Arguments
    ---------
    doc:
        PDF document, given either as a `PDFDoc` or as raw PDF bytes.
        Raw bytes are wrapped in a new `PDFDoc`.

    Returns
    -------
    PDFDoc:
        The document, with `lines` set to the sorted list of extracted
        text boxes. If PDFMiner raises a `PDFException` and
        `raise_on_error` is False, `lines` is empty and `error` is True.
    """

    if not isinstance(doc, PDFDoc):
        content = bytes(doc)
        # NOTE: the built-in `hash` of a bytes object is salted per
        # interpreter process (unless PYTHONHASHSEED is fixed), so this id
        # is not stable from one run to the next.
        doc = PDFDoc(id=str(hash(content)), content=content)
    content = doc.content
    content_stream = BytesIO(content)

    try:
        # Materialise the generator here so that parsing errors of any page
        # surface inside this try block.
        layout = list(extract_pages(content_stream, laparams=self.laparams))
    except PDFException:
        if self.raise_on_error:
            raise
        doc.lines = []
        doc.error = True
        return doc

    lines = []

    for page_no, page in enumerate(layout):

        w = page.width
        h = page.height

        for bloc in page:
            # Only horizontal text boxes are kept; every other layout
            # element of the page is skipped.
            if not isinstance(bloc, LTTextBoxHorizontal):
                continue

            for line in bloc:
                text, styles = extract_style_from_line(line)
                if len(text) == 0:
                    continue
                lines.append(
                    TextBox(
                        page=page_no,
                        # Coordinates are expressed as fractions of the page
                        # size; the vertical axis is flipped (the new y0 is
                        # derived from the line's y1).
                        x0=line.x0 / w,
                        x1=line.x1 / w,
                        y0=1 - line.y1 / h,
                        y1=1 - line.y0 / h,
                        page_width=w,
                        page_height=h,
                        text=text,
                        styles=styles if self.extract_style else (),
                    )
                )

    # Drop the lines that are not fully contained in the page, then sort the
    # rest (the order is the one defined by `TextBox` comparisons).
    doc.lines = sorted(
        [
            line
            for line in lines
            if line.x0 >= 0 and line.y0 >= 0 and line.x1 <= 1 and line.y1 <= 1
        ]
    )

    return doc