`edspdf.readers.reader`

`PdfReader`

Source code in edspdf/readers/reader.py

@registry.readers.register("pdf-reader.v1")
class PdfReader:
    """
    Chain an extractor, an optional transform, a classifier and an
    aggregator to turn a PDF document into aggregated text.
    """

    def __init__(
        self,
        extractor: Optional[BaseExtractor] = None,
        classifier: Optional[BaseClassifier] = None,
        aggregator: Optional[BaseAggregator] = None,
        transform: Optional[BaseTransform] = None,
        meta_labels: Dict[str, str] = dict(),
    ) -> None:
        """
        Reads a text-based PDF document.

        Parameters
        ----------
        extractor : BaseExtractor
            Text block extractor.
        classifier : BaseClassifier
            Classifier model, to assign a section (eg `body`, `header`, etc).
        aggregator : BaseAggregator
            Aggregator model, to compile labelled text blocks together.
        transform : BaseTransform, optional
            Transformation to apply before classification.
        meta_labels : Dict[str, str], optional
            Dictionary of hierarchical labels
            (eg `table` is probably within the `body`).
            The mapping is copied, so later changes to the caller's
            dictionary do not affect the reader.
        """

        self.extractor = extractor
        self.classifier = classifier
        self.aggregator = aggregator

        self.transform = transform
        # The default `dict()` is created once, when the function is defined.
        # Storing it directly would share a single dictionary between every
        # reader built without `meta_labels`; copying keeps instances isolated.
        self.meta_labels = dict(meta_labels or {})

    def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
        """
        Predict the label of each text block.

        The `label` and `meta_label` columns are written into `lines`
        itself; the same object is returned.

        Parameters
        ----------
        lines : pd.DataFrame
            Text blocks to label.

        Returns
        -------
        pd.DataFrame
            Labelled text blocks.
        """

        lines["label"] = self.classifier.predict(lines)
        # Labels without an entry in `meta_labels` are kept unchanged.
        lines["meta_label"] = lines["label"].replace(self.meta_labels)

        return lines

    def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """
        Prepare data before classification.
        Can also be used to generate the training dataset for the classifier.

        Parameters
        ----------
        pdf : bytes
            PDF document, as bytes.
        context : Any
            Extra keyword arguments; each one is assigned to the extracted
            lines under its own name before the transformation is applied.

        Returns
        -------
        pd.DataFrame
            Text blocks as a pandas DataFrame.
        """

        lines = self.extractor(pdf)

        for key, value in context.items():
            lines[key] = value

        # The transformation is optional: skip it when none was configured.
        if self.transform is not None:
            lines = self.transform(lines)

        return lines

    def prepare_and_predict(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """
        Run `prepare_data` followed by `predict` on a PDF document.

        Parameters
        ----------
        pdf : bytes
            PDF document, as bytes.
        context : Any
            Keyword arguments forwarded to `prepare_data`.

        Returns
        -------
        pd.DataFrame
            Labelled text blocks.
        """
        lines = self.prepare_data(pdf, **context)
        lines = self.predict(lines)
        return lines

    def __call__(
        self, pdf: bytes, **context: Any
    ) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
        """
        Process the PDF document.

        Parameters
        ----------
        pdf : bytes
            Byte representation of the PDF document.

        context : Any
            Any contextual information that is used by the classifier
            (eg document type or source).

        Returns
        -------
        Dict[str, str]
            Dictionary containing the aggregated text.
        """
        lines = self.prepare_and_predict(pdf, **context)
        result = self.aggregator(lines)
        return result

`__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())`

Reads a text-based PDF document.

PARAMETER	DESCRIPTION
`extractor`	Text block extractor. TYPE: `BaseExtractor` DEFAULT: `None`
`classifier`	Classifier model, to assign a section (eg `body`, `header`, etc). TYPE: `BaseClassifier` DEFAULT: `None`
`aggregator`	Aggregator model, to compile labelled text blocks together. TYPE: `BaseAggregator` DEFAULT: `None`
`transform`	Transformation to apply before classification. TYPE: `BaseTransform, optional` DEFAULT: `None`
`meta_labels`	Dictionary of hierarchical labels (eg `table` is probably within the `body`). TYPE: `Dict[str, str], optional` DEFAULT: `dict()`

Source code in edspdf/readers/reader.py

def __init__(
    self,
    extractor: Optional[BaseExtractor] = None,
    classifier: Optional[BaseClassifier] = None,
    aggregator: Optional[BaseAggregator] = None,
    transform: Optional[BaseTransform] = None,
    meta_labels: Dict[str, str] = dict(),
) -> None:
    """
    Reads a text-based PDF document.

    Parameters
    ----------
    extractor : BaseExtractor
        Text block extractor.
    classifier : BaseClassifier
        Classifier model, to assign a section (eg `body`, `header`, etc).
    aggregator : BaseAggregator
        Aggregator model, to compile labelled text blocks together.
    transform : BaseTransform, optional
        Transformation to apply before classification.
    meta_labels : Dict[str, str], optional
        Dictionary of hierarchical labels
        (eg `table` is probably within the `body`).
        The mapping is copied, so later changes to the caller's
        dictionary do not affect the reader.
    """

    self.extractor = extractor
    self.classifier = classifier
    self.aggregator = aggregator

    self.transform = transform
    # The default `dict()` is created once, when the function is defined.
    # Storing it directly would share a single dictionary between every
    # reader built without `meta_labels`; copying keeps instances isolated.
    self.meta_labels = dict(meta_labels or {})

`predict(lines)`

Predict the label of each text block.

PARAMETER DESCRIPTION

lines

Text blocks to label.

TYPE: pd.DataFrame

RETURNS	DESCRIPTION
`pd.DataFrame`	Labelled text blocks.

Source code in edspdf/readers/reader.py

def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
    """
    Assign a label, then a meta label, to every text block.

    Both columns are written into `lines` itself, and that same
    object is handed back to the caller.

    Parameters
    ----------
    lines : pd.DataFrame
        Text blocks to label.

    Returns
    -------
    pd.DataFrame
        The input frame with `label` and `meta_label` columns set.
    """

    predicted = self.classifier.predict(lines)
    lines["label"] = predicted

    # Labels that have no entry in the mapping are kept unchanged.
    meta = lines["label"].replace(self.meta_labels)
    lines["meta_label"] = meta

    return lines

`prepare_data(pdf, **context)`

Prepare data before classification. Can also be used to generate the training dataset for the classifier.

PARAMETER DESCRIPTION

pdf

PDF document, as bytes.

TYPE: bytes

RETURNS	DESCRIPTION
`pd.DataFrame`	Text blocks as a pandas DataFrame.

Source code in edspdf/readers/reader.py

def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
    """
    Extract the text blocks of a PDF and get them ready for classification.
    The result can also serve as a training dataset for the classifier.

    Parameters
    ----------
    pdf : bytes
        PDF document, as bytes.
    context : Any
        Extra keyword arguments; each one is assigned to the extracted
        lines under its own name before the transformation runs.

    Returns
    -------
    pd.DataFrame
        Text blocks as a pandas DataFrame.
    """

    blocks = self.extractor(pdf)

    for column in context:
        blocks[column] = context[column]

    # No transformation configured: return the extracted lines as they are.
    if self.transform is None:
        return blocks

    return self.transform(blocks)

`__call__(pdf, **context)`

Process the PDF document.

PARAMETER DESCRIPTION

pdf

Byte representation of the PDF document.

TYPE: bytes

context

Any contextual information that is used by the classifier (eg document type or source).

TYPE: Any

RETURNS	DESCRIPTION
`Dict[str, str]`	Dictionary containing the aggregated text.

Source code in edspdf/readers/reader.py

def __call__(
    self, pdf: bytes, **context: Any
) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
    """
    Run the full pipeline on a PDF document.

    Parameters
    ----------
    pdf : bytes
        Byte representation of the PDF document.

    context : Any
        Any contextual information that is used by the classifier
        (eg document type or source).

    Returns
    -------
    Dict[str, str]
        Dictionary containing the aggregated text.
    """
    # Extraction, optional transformation and classification happen first;
    # the aggregator then receives the labelled lines.
    labelled = self.prepare_and_predict(pdf, **context)
    return self.aggregator(labelled)

edspdf.readers.reader

PdfReader

__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())

predict(lines)

prepare_data(pdf, **context)

__call__(pdf, **context)

`edspdf.readers.reader`

`PdfReader`

`__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())`

`predict(lines)`

`prepare_data(pdf, **context)`

`__call__(pdf, **context)`