Skip to content

edspdf.readers.reader

PdfReader

Source code in edspdf/readers/reader.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@registry.readers.register("pdf-reader.v1")
class PdfReader:
    """
    Reader for text-based PDF documents.

    Calling the reader chains the configured components: the extractor
    produces text blocs, the optional transform is applied, the classifier
    labels each bloc, and the aggregator compiles the labelled blocs.
    """

    def __init__(
        self,
        extractor: Optional[BaseExtractor] = None,
        classifier: Optional[BaseClassifier] = None,
        aggregator: Optional[BaseAggregator] = None,
        transform: Optional[BaseTransform] = None,
        meta_labels: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Reads a text-based PDF document.

        Parameters
        ----------
        extractor : BaseExtractor
            Text bloc extractor.
        classifier : BaseClassifier
            Classifier model, to assign a section (eg `body`, `header`, etc).
        aggregator : BaseAggregator
            Aggregator model, to compile labelled text blocs together.
        transform : BaseTransform, optional
            Transformation to apply before classification.
        meta_labels : Dict[str, str], optional
            Dictionary of hierarchical labels
            (eg `table` is probably within the `body`).
            Defaults to an empty mapping (no label is remapped).
        """

        self.extractor = extractor
        self.classifier = classifier
        self.aggregator = aggregator

        self.transform = transform
        # A fresh dict per instance: a `dict()` default in the signature is
        # evaluated once and would be shared by every reader that relies on it.
        self.meta_labels = {} if meta_labels is None else meta_labels

    def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
        """
        Predict the label of each text bloc.

        The `label` and `meta_label` columns are written directly on
        `lines`, which is then returned.

        Parameters
        ----------
        lines : pd.DataFrame
            Text blocs to label.

        Returns
        -------
        pd.DataFrame
            Labelled text blocs.
        """

        lines["label"] = self.classifier.predict(lines)
        # Labels without an entry in `meta_labels` are kept unchanged.
        lines["meta_label"] = lines.label.replace(self.meta_labels)

        return lines

    def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """
        Prepare data before classification.
        Can also be used to generate the training dataset for the classifier.

        Parameters
        ----------
        pdf : bytes
            PDF document, as bytes.
        context : Any
            Contextual information; each keyword is added as a column
            on the extracted text blocs.

        Returns
        -------
        pd.DataFrame
            Text blocs as a pandas DataFrame.
        """

        lines = self.extractor(pdf)

        # Each context entry becomes a column of the extracted blocs.
        for key, value in context.items():
            lines[key] = value

        # Apply transformation
        if self.transform is not None:
            lines = self.transform(lines)

        return lines

    def prepare_and_predict(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """
        Extract the text blocs of a PDF document and label them.

        Parameters
        ----------
        pdf : bytes
            PDF document, as bytes.
        context : Any
            Contextual information, forwarded to `prepare_data`.

        Returns
        -------
        pd.DataFrame
            Labelled text blocs.
        """
        lines = self.prepare_data(pdf, **context)
        lines = self.predict(lines)
        return lines

    def __call__(
        self, pdf: bytes, **context: Any
    ) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
        """
        Process the PDF document.

        Parameters
        ----------
        pdf : bytes
            Byte representation of the PDF document.
        context : Any
            Any contextual information that is used by the classifier
            (eg document type or source).

        Returns
        -------
        Dict[str, str]
            Dictionary containing the aggregated text.
        """
        lines = self.prepare_and_predict(pdf, **context)
        result = self.aggregator(lines)
        return result

__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())

Reads a text-based PDF document.

PARAMETER DESCRIPTION
extractor

Text bloc extractor.

TYPE: BaseExtractor DEFAULT: None

classifier

Classifier model, to assign a section (eg body, header, etc).

TYPE: BaseClassifier DEFAULT: None

aggregator

Aggregator model, to compile labelled text blocs together.

TYPE: BaseAggregator DEFAULT: None

transform

Transformation to apply before classification.

TYPE: BaseTransform, optional DEFAULT: None

meta_labels

Dictionary of hierarchical labels (e.g. `table` is probably within the `body`).

TYPE: Dict[str, str], optional DEFAULT: dict()

Source code in edspdf/readers/reader.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(
    self,
    extractor: Optional[BaseExtractor] = None,
    classifier: Optional[BaseClassifier] = None,
    aggregator: Optional[BaseAggregator] = None,
    transform: Optional[BaseTransform] = None,
    meta_labels: Optional[Dict[str, str]] = None,
) -> None:
    """
    Reads a text-based PDF document.

    Parameters
    ----------
    extractor : BaseExtractor
        Text bloc extractor.
    classifier : BaseClassifier
        Classifier model, to assign a section (eg `body`, `header`, etc).
    aggregator : BaseAggregator
        Aggregator model, to compile labelled text blocs together.
    transform : BaseTransform, optional
        Transformation to apply before classification.
    meta_labels : Dict[str, str], optional
        Dictionary of hierarchical labels
        (eg `table` is probably within the `body`).
        Defaults to an empty mapping (no label is remapped).
    """

    self.extractor = extractor
    self.classifier = classifier
    self.aggregator = aggregator

    self.transform = transform
    # A fresh dict per instance: a `dict()` default in the signature is
    # evaluated once and would be shared by every reader that relies on it.
    self.meta_labels = {} if meta_labels is None else meta_labels

predict(lines)

Predict the label of each text bloc.

PARAMETER DESCRIPTION
lines

Text blocs to label.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame

Labelled text blocs.

Source code in edspdf/readers/reader.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
    """
    Assign a label and a meta-label to every text bloc.

    Both columns are written directly on `lines`; the same DataFrame
    object is returned.

    Parameters
    ----------
    lines : pd.DataFrame
        Text blocs to label.

    Returns
    -------
    pd.DataFrame
        The input blocs, with `label` and `meta_label` columns set.
    """
    predicted = self.classifier.predict(lines)
    lines["label"] = predicted

    # Labels that have no entry in `meta_labels` are left as they are.
    meta = lines["label"].replace(self.meta_labels)
    lines["meta_label"] = meta
    return lines

prepare_data(pdf, **context)

Prepare data before classification. Can also be used to generate the training dataset for the classifier.

PARAMETER DESCRIPTION
pdf

PDF document, as bytes.

TYPE: bytes

RETURNS DESCRIPTION
pd.DataFrame

Text blocs as a pandas DataFrame.

Source code in edspdf/readers/reader.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
    """
    Extract the text blocs of a PDF and get them ready for classification.

    Also usable on its own to build a training dataset for the classifier.

    Parameters
    ----------
    pdf : bytes
        PDF document, as bytes.
    context : Any
        Contextual information; every keyword is stored as a column
        on the extracted blocs.

    Returns
    -------
    pd.DataFrame
        Text blocs as a pandas DataFrame.
    """
    blocs = self.extractor(pdf)

    # Attach the context to the blocs, one column per keyword.
    for column in context:
        blocs[column] = context[column]

    # Without a transform, the extracted blocs are returned as they are.
    if self.transform is None:
        return blocs

    return self.transform(blocs)

__call__(pdf, **context)

Process the PDF document.

PARAMETER DESCRIPTION
pdf

Byte representation of the PDF document.

TYPE: bytes

context

Any contextual information that is used by the classifier (e.g. document type or source).

TYPE: Any

RETURNS DESCRIPTION
Dict[str, str]

Dictionary containing the aggregated text.

Source code in edspdf/readers/reader.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def __call__(
    self, pdf: bytes, **context: Any
) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
    """
    Run the full pipeline on a PDF document.

    Parameters
    ----------
    pdf : bytes
        Byte representation of the PDF document.
    context : Any
        Any contextual information that is used by the classifier
        (eg document type or source).

    Returns
    -------
    Dict[str, str]
        Dictionary containing the aggregated text.
    """
    # Labelled blocs go straight to the aggregator, whose output is returned.
    labelled = self.prepare_and_predict(pdf, **context)
    return self.aggregator(labelled)