`edspdf.components.extractors.pdfminer`

`PdfMinerExtractor`

Bases: Component

Source code in edspdf/components/extractors/pdfminer.py

@registry.factory.register("pdfminer-extractor")
class PdfMinerExtractor(Component):
    """
    Line extractor built on PDFMiner's layout analysis.

    Calling an instance on a PDF fills ``doc.lines`` with one ``TextBox`` per
    non-empty line found in the horizontal text boxes of each page.
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.05,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
        extract_style: bool = False,
        raise_on_error: bool = False,
    ):
        """
        Extractor object. Given a PDF byte stream, produces a list of elements.

        Parameters
        ----------
        line_overlap : float
            See PDFMiner documentation
        char_margin : float
            See PDFMiner documentation
        line_margin : float
            See PDFMiner documentation
        word_margin : float
            See PDFMiner documentation
        boxes_flow : Optional[float]
            See PDFMiner documentation
        detect_vertical : bool
            See PDFMiner documentation
        all_texts : bool
            See PDFMiner documentation
        extract_style : bool
            Whether to extract style (font, size, ...) information for each line of
            the document.
            Default: False
        raise_on_error : bool
            Whether to re-raise the ``PDFException`` raised while parsing a
            document. When False, the document is returned with ``error`` set to
            True and an empty list of lines.
            Default: False
        """

        super().__init__()

        self.laparams = LAParams(
            line_overlap=line_overlap,
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            boxes_flow=boxes_flow,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )
        self.extract_style = extract_style
        self.raise_on_error = raise_on_error

    def __call__(self, doc: Union[PDFDoc, bytes]) -> PDFDoc:
        """
        Extract the text lines of a PDF and store them in ``doc.lines``.

        Only lines belonging to horizontal text boxes are kept. Coordinates are
        divided by the page width and height, the vertical axis is inverted
        (``y0 = 1 - line.y1 / h``), and lines that do not fit inside the unit
        square are dropped. The remaining lines are sorted.

        Parameters
        ----------
        doc : Union[PDFDoc, bytes]
            PDF document. Anything that is not a ``PDFDoc`` is converted to bytes
            and wrapped into a new ``PDFDoc``.

        Returns
        -------
        PDFDoc
            The document, with ``lines`` filled. If parsing failed and
            ``raise_on_error`` is False, ``lines`` is empty and ``error`` is True.

        Raises
        ------
        PDFException
            If PDFMiner fails to parse the document and ``raise_on_error`` is True.
        """

        if not isinstance(doc, PDFDoc):
            content = bytes(doc)
            # NOTE(review): the builtin hash of a bytes object is salted per
            # interpreter process unless PYTHONHASHSEED is fixed, so this id is
            # presumably not stable across runs - confirm nobody relies on it.
            doc = PDFDoc(id=str(hash(content)), content=content)
        content_stream = BytesIO(doc.content)

        try:
            # Materialise the pages inside the try block so that errors raised
            # while iterating over the pages are caught here as well.
            layout = list(extract_pages(content_stream, laparams=self.laparams))
        except PDFException:
            if self.raise_on_error:
                raise
            doc.lines = []
            doc.error = True
            return doc

        lines = []

        for page_no, page in enumerate(layout):
            w = page.width
            h = page.height

            for bloc in page:
                if not isinstance(bloc, LTTextBoxHorizontal):
                    continue

                for line in bloc:
                    text, styles = extract_style_from_line(line)
                    if not text:
                        continue
                    lines.append(
                        TextBox(
                            page=page_no,
                            x0=line.x0 / w,
                            x1=line.x1 / w,
                            # Invert the vertical axis: the line's upper edge
                            # (y1) becomes the box's y0.
                            y0=1 - line.y1 / h,
                            y1=1 - line.y0 / h,
                            page_width=w,
                            page_height=h,
                            text=text,
                            styles=styles if self.extract_style else (),
                        )
                    )

        # Keep only the lines that lie entirely within the page.
        doc.lines = sorted(
            line
            for line in lines
            if line.x0 >= 0 and line.y0 >= 0 and line.x1 <= 1 and line.y1 <= 1
        )

        return doc

`__init__(line_overlap=0.5, char_margin=2.05, line_margin=0.5, word_margin=0.1, boxes_flow=0.5, detect_vertical=False, all_texts=False, extract_style=False, raise_on_error=False)`

Extractor object. Given a PDF byte stream, produces a list of elements.

PARAMETER	DESCRIPTION
`line_overlap`	See PDFMiner documentation TYPE: `float` DEFAULT: `0.5`
`char_margin`	See PDFMiner documentation TYPE: `float` DEFAULT: `2.05`
`line_margin`	See PDFMiner documentation TYPE: `float` DEFAULT: `0.5`
`word_margin`	See PDFMiner documentation TYPE: `float` DEFAULT: `0.1`
`boxes_flow`	See PDFMiner documentation TYPE: `Optional[float]` DEFAULT: `0.5`
`detect_vertical`	See PDFMiner documentation TYPE: `bool` DEFAULT: `False`
`all_texts`	See PDFMiner documentation TYPE: `bool` DEFAULT: `False`
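`extract_style`	Whether to extract style (font, size, ...) information for each line of the document. Default: False TYPE: `bool` DEFAULT: `False`
`raise_on_error`	Whether to re-raise the PDFMiner exception raised when a document cannot be parsed. When False, the document is returned with `error` set to True and no lines. TYPE: `bool` DEFAULT: `False`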

Source code in edspdf/components/extractors/pdfminer.py

def __init__(
    self,
    line_overlap: float = 0.5,
    char_margin: float = 2.05,
    line_margin: float = 0.5,
    word_margin: float = 0.1,
    boxes_flow: Optional[float] = 0.5,
    detect_vertical: bool = False,
    all_texts: bool = False,
    extract_style: bool = False,
    raise_on_error: bool = False,
):
    """
    Configure the extractor, which turns a PDF byte stream into a list of
    elements.

    Parameters
    ----------
    line_overlap : float
        Forwarded to PDFMiner's ``LAParams``, see PDFMiner documentation
    char_margin : float
        Forwarded to PDFMiner's ``LAParams``, see PDFMiner documentation
    line_margin : float
        Forwarded to PDFMiner's ``LAParams``, see PDFMiner documentation
    word_margin : float
        Forwarded to PDFMiner's ``LAParams``, see PDFMiner documentation
    boxes_flow : Optional[float]
        Forwarded to PDFMiner's ``LAParams``, see PDFMiner documentation
    detect_vertical : bool
        Forwarded to PDFMiner's ``LAParams``, see PDFMiner documentation
    all_texts : bool
        Forwarded to PDFMiner's ``LAParams``, see PDFMiner documentation
    extract_style : bool
        Whether to extract style (font, size, ...) information for each line of
        the document.
        Default: False
    raise_on_error : bool
        Whether to re-raise the ``PDFException`` raised while parsing a document.
        When False, the document is returned with ``error`` set to True and an
        empty list of lines.
        Default: False
    """

    super().__init__()

    # Gather the layout-analysis options, then hand them to PDFMiner in one go.
    layout_options = dict(
        line_overlap=line_overlap,
        char_margin=char_margin,
        line_margin=line_margin,
        word_margin=word_margin,
        boxes_flow=boxes_flow,
        detect_vertical=detect_vertical,
        all_texts=all_texts,
    )
    self.laparams = LAParams(**layout_options)

    # Flags read at call time.
    self.extract_style = extract_style
    self.raise_on_error = raise_on_error

`__call__(doc)`

Extract the text lines found in the horizontal text boxes of every page of the PDF.

Arguments

doc: PDF document

RETURNS DESCRIPTION

PDFDoc

PDF document

TYPE: PDFDoc

Source code in edspdf/components/extractors/pdfminer.py

def __call__(self, doc: Union[PDFDoc, bytes]) -> PDFDoc:
    """
    Extract the text lines of a PDF and store them in ``doc.lines``.

    Only lines belonging to horizontal text boxes are kept. Coordinates are
    divided by the page width and height, the vertical axis is inverted
    (``y0 = 1 - line.y1 / h``), and lines that do not fit inside the unit
    square are dropped. The remaining lines are sorted.

    Parameters
    ----------
    doc : Union[PDFDoc, bytes]
        PDF document. Anything that is not a ``PDFDoc`` is converted to bytes
        and wrapped into a new ``PDFDoc``.

    Returns
    -------
    PDFDoc
        The document, with ``lines`` filled. If parsing failed and
        ``raise_on_error`` is False, ``lines`` is empty and ``error`` is True.

    Raises
    ------
    PDFException
        If PDFMiner fails to parse the document and ``raise_on_error`` is True.
    """

    if not isinstance(doc, PDFDoc):
        content = bytes(doc)
        # NOTE(review): the builtin hash of a bytes object is salted per
        # interpreter process unless PYTHONHASHSEED is fixed, so this id is
        # presumably not stable across runs - confirm nobody relies on it.
        doc = PDFDoc(id=str(hash(content)), content=content)
    content_stream = BytesIO(doc.content)

    try:
        # Materialise the pages inside the try block so that errors raised
        # while iterating over the pages are caught here as well.
        layout = list(extract_pages(content_stream, laparams=self.laparams))
    except PDFException:
        if self.raise_on_error:
            raise
        doc.lines = []
        doc.error = True
        return doc

    lines = []

    for page_no, page in enumerate(layout):
        w = page.width
        h = page.height

        for bloc in page:
            if not isinstance(bloc, LTTextBoxHorizontal):
                continue

            for line in bloc:
                text, styles = extract_style_from_line(line)
                if not text:
                    continue
                lines.append(
                    TextBox(
                        page=page_no,
                        x0=line.x0 / w,
                        x1=line.x1 / w,
                        # Invert the vertical axis: the line's upper edge
                        # (y1) becomes the box's y0.
                        y0=1 - line.y1 / h,
                        y1=1 - line.y0 / h,
                        page_width=w,
                        page_height=h,
                        text=text,
                        styles=styles if self.extract_style else (),
                    )
                )

    # Keep only the lines that lie entirely within the page.
    doc.lines = sorted(
        line
        for line in lines
        if line.x0 >= 0 and line.y0 >= 0 and line.x1 <= 1 and line.y1 <= 1
    )

    return doc

edspdf.components.extractors.pdfminer

PdfMinerExtractor

__init__(line_overlap=0.5, char_margin=2.05, line_margin=0.5, word_margin=0.1, boxes_flow=0.5, detect_vertical=False, all_texts=False, extract_style=False, raise_on_error=False)

__call__(doc)

Arguments

`edspdf.components.extractors.pdfminer`

`PdfMinerExtractor`

`__init__(line_overlap=0.5, char_margin=2.05, line_margin=0.5, word_margin=0.1, boxes_flow=0.5, detect_vertical=False, all_texts=False, extract_style=False, raise_on_error=False)`

`__call__(doc)`