Skip to content

edspdf

loading

load(path)

Load a complete pipeline.

TODO: implement other ways to load a pipeline.

PARAMETER DESCRIPTION
path

Path to the pipeline.

TYPE: Path

RETURNS DESCRIPTION
PdfReader

A PdfReader object.

Source code in edspdf/loading.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def load(path: Path) -> PdfReader:
    """
    Build a complete pipeline from a configuration file stored on disk.

    TODO: implement other ways to load a pipeline.

    Parameters
    ----------
    path : Path
        Location of the configuration file describing the pipeline.

    Returns
    -------
    PdfReader
        The object found under the `reader` key of the resolved configuration.
    """
    resolved = registry.resolve(Config().from_disk(path))
    reader = resolved["reader"]
    return reader

from_str(config)

Load a complete pipeline from a string config.

PARAMETER DESCRIPTION
config

Configuration.

TYPE: str

RETURNS DESCRIPTION
PdfReader

A PdfReader object.

Source code in edspdf/loading.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def from_str(config: str) -> PdfReader:
    """
    Build a complete pipeline from a configuration given as a string.

    Parameters
    ----------
    config : str
        Textual configuration describing the pipeline.

    Returns
    -------
    PdfReader
        The object found under the `reader` key of the resolved configuration.
    """
    resolved = registry.resolve(Config().from_str(config))
    reader = resolved["reader"]
    return reader

aggregators

styled

StyledAggregator

Bases: SimpleAggregator

Aggregator that returns text and styles.

Source code in edspdf/aggregators/styled.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
@registry.aggregators.register("styled.v1")
class StyledAggregator(SimpleAggregator):
    """
    Aggregator that returns text and styles.

    For every label, the text of its lines is concatenated and the style
    records attached to those lines are shifted so that their `start`/`end`
    are expressed relative to the concatenated text.
    """

    def aggregate(
        self, lines: pd.DataFrame
    ) -> Tuple[Dict[str, str], Dict[str, List[Dict]]]:
        """
        Aggregate labelled lines into one text and one list of styles per label.

        Parameters
        ----------
        lines : pd.DataFrame
            Labelled lines. The code reads the `page`, `y1`, `x0`, `styles` and
            `label` columns, and the `text_with_newline` column, which is
            presumably added by `prepare_newlines` (defined elsewhere - to be
            confirmed).

        Returns
        -------
        Tuple[Dict[str, str], Dict[str, List[Dict]]]
            The concatenated text for each label, and for each of those labels
            the list of style records with shifted `start`/`end`.
        """

        # Order the lines by page, then by `y1`, then by `x0`.
        lines = lines.sort_values(["page", "y1", "x0"])

        # Positional identifier following the sorted order; it links each style
        # record back to its line after the explode below.
        lines["line_id"] = range(len(lines))

        # One row per style record; lines without any style are dropped.
        styles = lines[["line_id", "styles"]].explode("styles").dropna().reset_index()
        # Expand each record into columns.
        # NOTE(review): assumes every record is a mapping exposing at least
        # `start` and `end` (both are updated below) - confirm against the
        # extractor.
        styles = styles[["line_id"]].join(pd.json_normalize(styles.styles))

        lines = prepare_newlines(
            lines,
            nl_threshold=self.nl_threshold,
            np_threshold=self.np_threshold,
        )

        # Offset of a line = total length of the lines that precede it within
        # the same label: cumulative sum of the lengths, shifted by one row
        # (the first line of each label gets NaN, replaced by 0).
        lines["offset"] = lines["text_with_newline"].str.len()
        lines["offset"] = lines.groupby(["label"])["offset"].transform("cumsum")
        lines["offset"] = lines.groupby(["label"])["offset"].transform("shift")
        lines["offset"] = lines["offset"].fillna(0).astype(int)

        # Move the style boundaries from line-relative to label-relative.
        styles = styles.merge(lines[["line_id", "offset", "label"]], on="line_id")
        styles["start"] += styles.offset
        styles["end"] += styles.offset

        # Summing strings concatenates them: one text per label.
        df = lines.groupby(["label"]).agg(text=("text_with_newline", "sum"))

        text = df.text.to_dict()
        # For each label, keep only the style attributes (bookkeeping columns
        # removed) as a list of plain dictionaries.
        style = {
            label: styles.query("label == @label")
            .drop(columns=["line_id", "offset", "label"])
            .to_dict(orient="records")
            for label in text.keys()
        }

        return text, style

base

BaseAggregator

Bases: ABC

Source code in edspdf/aggregators/base.py
 7
 8
 9
10
11
12
13
14
15
16
17
class BaseAggregator(ABC):
    """
    Interface shared by aggregators: subclasses implement `aggregate`, and
    instances are invoked like functions.
    """

    @abstractmethod
    def aggregate(self, lines: pd.DataFrame) -> Dict[str, str]:
        """
        Turn the lines into aggregated text. Must be provided by subclasses.
        """

    def __call__(self, lines: pd.DataFrame, copy: bool = False) -> Dict[str, str]:
        """
        Run `aggregate`, on a copy of `lines` when `copy` is true so that the
        caller's DataFrame is left untouched.
        """
        frame = lines.copy() if copy else lines
        return self.aggregate(frame)
aggregate(lines) abstractmethod

Handles the text aggregation

Source code in edspdf/aggregators/base.py
 8
 9
10
11
12
@abstractmethod
def aggregate(self, lines: pd.DataFrame) -> Dict[str, str]:
    """
    Handles the text aggregation.

    Abstract hook: the body is empty here and concrete aggregators supply it.

    Parameters
    ----------
    lines : pd.DataFrame
        Lines to aggregate.

    Returns
    -------
    Dict[str, str]
        Aggregated text, as a string-to-string mapping (per the annotation).
    """

transforms

base

BaseTransform

Bases: ABC

Source code in edspdf/transforms/base.py
 6
 7
 8
 9
10
11
12
13
14
class BaseTransform(ABC):
    """
    Interface shared by transformations: subclasses implement `transform`, and
    instances are invoked like functions.
    """

    @abstractmethod
    def transform(self, lines: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the transformation to the lines. Must be provided by subclasses.
        """

    def __call__(self, lines: pd.DataFrame) -> pd.DataFrame:
        """
        Shortcut for `transform(lines)`.
        """
        transformed = self.transform(lines)
        return transformed
transform(lines) abstractmethod

Handles the transformation

Source code in edspdf/transforms/base.py
 7
 8
 9
10
11
@abstractmethod
def transform(self, lines: pd.DataFrame) -> pd.DataFrame:
    """
    Handles the transformation.

    Abstract hook: the body is empty here and concrete transforms supply it.

    Parameters
    ----------
    lines : pd.DataFrame
        Lines to transform.

    Returns
    -------
    pd.DataFrame
        Transformed lines (per the annotation).
    """

readers

reader

PdfReader

Source code in edspdf/readers/reader.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@registry.readers.register("pdf-reader.v1")
class PdfReader:
    """
    Pipeline chaining an extractor, an optional transform, a classifier and an
    aggregator to turn a PDF (as bytes) into aggregated text.
    """

    def __init__(
        self,
        extractor: Optional[BaseExtractor] = None,
        classifier: Optional[BaseClassifier] = None,
        aggregator: Optional[BaseAggregator] = None,
        transform: Optional[BaseTransform] = None,
        meta_labels: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Reads a text-based PDF document.

        Parameters
        ----------
        extractor : BaseExtractor
            Text bloc extractor.
        classifier : BaseClassifier
            Classifier model, to assign a section (eg `body`, `header`, etc).
        aggregator : BaseAggregator
            Aggregator model, to compile labelled text blocs together.
        transform : BaseTransform, optional
            Transformation to apply before classification.
        meta_labels : Dict[str, str], optional
            Dictionary of hierarchical labels
            (eg `table` is probably within the `body`).
            Defaults to an empty mapping.
        """

        self.extractor = extractor
        self.classifier = classifier
        self.aggregator = aggregator

        self.transform = transform
        # A fresh dictionary per instance: a `dict()` default in the signature
        # would be created once and shared by every reader built without it.
        self.meta_labels = {} if meta_labels is None else meta_labels

    def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
        """
        Predict the label of each text bloc.

        Parameters
        ----------
        lines : pd.DataFrame
            Text blocs to label. The DataFrame is modified in place.

        Returns
        -------
        pd.DataFrame
            Labelled text blocs (same object, with the `label` and
            `meta_label` columns set).
        """

        lines["label"] = self.classifier.predict(lines)
        # Labels listed in `meta_labels` are replaced by their parent label,
        # the others are kept as they are.
        lines["meta_label"] = lines.label.replace(self.meta_labels)

        return lines

    def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """
        Prepare data before classification.
        Can also be used to generate the training dataset for the classifier.

        Parameters
        ----------
        pdf : bytes
            PDF document, as bytes.
        context : Any
            Extra values, each stored in a column named after its keyword.

        Returns
        -------
        pd.DataFrame
            Text blocs as a pandas DataFrame.
        """

        lines = self.extractor(pdf)

        for key, value in context.items():
            lines[key] = value

        # Apply transformation
        if self.transform is not None:
            lines = self.transform(lines)

        return lines

    def prepare_and_predict(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """
        Chain `prepare_data` and `predict`.

        Parameters
        ----------
        pdf : bytes
            PDF document, as bytes.
        context : Any
            Extra values forwarded to `prepare_data`.

        Returns
        -------
        pd.DataFrame
            Labelled text blocs.
        """
        lines = self.prepare_data(pdf, **context)
        lines = self.predict(lines)
        return lines

    def __call__(
        self, pdf: bytes, **context: Any
    ) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
        """
        Process the PDF document.

        Parameters
        ----------
        pdf : bytes
            Byte representation of the PDF document.

        context : Any
            Any contextual information that is used by the classifier
            (eg document type or source).

        Returns
        -------
        Dict[str, str]
            Dictionary containing the aggregated text. Whatever the aggregator
            returns is passed through unchanged, so this may also be a tuple,
            as the annotation allows.
        """
        lines = self.prepare_and_predict(pdf, **context)
        result = self.aggregator(lines)
        return result
__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())

Reads a text-based PDF document.

PARAMETER DESCRIPTION
extractor

Text bloc extractor.

TYPE: BaseExtractor DEFAULT: None

classifier

Classifier model, to assign a section (eg body, header, etc).

TYPE: BaseClassifier DEFAULT: None

aggregator

Aggregator model, to compile labelled text blocs together.

TYPE: BaseAggregator DEFAULT: None

transform

Transformation to apply before classification.

TYPE: BaseTransform, optional DEFAULT: None

meta_labels

Dictionary of hierarchical labels (eg table is probably within the body).

TYPE: Dict[str, str], optional DEFAULT: dict()

Source code in edspdf/readers/reader.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(
    self,
    extractor: Optional[BaseExtractor] = None,
    classifier: Optional[BaseClassifier] = None,
    aggregator: Optional[BaseAggregator] = None,
    transform: Optional[BaseTransform] = None,
    meta_labels: Optional[Dict[str, str]] = None,
) -> None:
    """
    Reads a text-based PDF document.

    Parameters
    ----------
    extractor : BaseExtractor
        Text bloc extractor.
    classifier : BaseClassifier
        Classifier model, to assign a section (eg `body`, `header`, etc).
    aggregator : BaseAggregator
        Aggregator model, to compile labelled text blocs together.
    transform : BaseTransform, optional
        Transformation to apply before classification.
    meta_labels : Dict[str, str], optional
        Dictionary of hierarchical labels
        (eg `table` is probably within the `body`).
        Defaults to an empty mapping.
    """

    self.extractor = extractor
    self.classifier = classifier
    self.aggregator = aggregator

    self.transform = transform
    # A fresh dictionary per instance: a `dict()` default in the signature
    # would be created once and shared by every reader built without it.
    self.meta_labels = {} if meta_labels is None else meta_labels
predict(lines)

Predict the label of each text bloc.

PARAMETER DESCRIPTION
lines

Text blocs to label.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame

Labelled text blocs.

Source code in edspdf/readers/reader.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
    """
    Label every text bloc with the classifier.

    Parameters
    ----------
    lines : pd.DataFrame
        Text blocs to label. The DataFrame is modified in place.

    Returns
    -------
    pd.DataFrame
        The same DataFrame, with the `label` and `meta_label` columns set.
    """
    predicted = self.classifier.predict(lines)
    lines["label"] = predicted

    # Labels listed in `meta_labels` are replaced by their parent label.
    parents = lines["label"].replace(self.meta_labels)
    lines["meta_label"] = parents

    return lines
prepare_data(pdf, **context)

Prepare data before classification. Can also be used to generate the training dataset for the classifier.

PARAMETER DESCRIPTION
pdf

PDF document, as bytes.

TYPE: bytes

RETURNS DESCRIPTION
pd.DataFrame

Text blocs as a pandas DataFrame.

Source code in edspdf/readers/reader.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
    """
    Extract the lines of a PDF and get them ready for classification.
    Can also be used to generate the training dataset for the classifier.

    Parameters
    ----------
    pdf : bytes
        PDF document, as bytes.
    context : Any
        Extra values, each stored in a column named after its keyword.

    Returns
    -------
    pd.DataFrame
        Text blocs as a pandas DataFrame.
    """
    frame = self.extractor(pdf)

    # Every contextual value becomes a column.
    for column in context:
        frame[column] = context[column]

    # The transformation step is optional.
    if self.transform is None:
        return frame
    return self.transform(frame)
__call__(pdf, **context)

Process the PDF document.

PARAMETER DESCRIPTION
pdf

Byte representation of the PDF document.

TYPE: bytes

context: Any contextual information that is used by the classifier (e.g. document type or source). TYPE: Any

RETURNS DESCRIPTION
Dict[str, str]

Dictionary containing the aggregated text.

Source code in edspdf/readers/reader.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def __call__(
    self, pdf: bytes, **context: Any
) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
    """
    Run the whole pipeline on a PDF document.

    Parameters
    ----------
    pdf : bytes
        Byte representation of the PDF document.

    context : Any
        Any contextual information that is used by the classifier
        (eg document type or source).

    Returns
    -------
    Dict[str, str]
        Dictionary containing the aggregated text. Whatever the aggregator
        returns is passed through unchanged.
    """
    labelled = self.prepare_and_predict(pdf, **context)
    return self.aggregator(labelled)

extractors

pdfminer

PdfMinerExtractor

Bases: BaseExtractor

Extractor object. Given a PDF byte stream, produces a list of blocs.

PARAMETER DESCRIPTION
line_overlap

See PDFMiner documentation

TYPE: float DEFAULT: 0.5

char_margin

See PDFMiner documentation

TYPE: float DEFAULT: 2.0

line_margin

See PDFMiner documentation

TYPE: float DEFAULT: 0.5

word_margin

See PDFMiner documentation

TYPE: float DEFAULT: 0.1

boxes_flow

See PDFMiner documentation

TYPE: Optional[float] DEFAULT: 0.5

detect_vertical

See PDFMiner documentation

TYPE: bool DEFAULT: False

all_texts

See PDFMiner documentation

TYPE: bool DEFAULT: False

Source code in edspdf/extractors/pdfminer.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
@registry.extractors.register("pdfminer.v1")
class PdfMinerExtractor(BaseExtractor):
    """
    Extractor object. Given a PDF byte stream, produces a list of blocs.

    Parameters
    ----------
    line_overlap : float
        See PDFMiner documentation
    char_margin : float
        See PDFMiner documentation
    line_margin : float
        See PDFMiner documentation
    word_margin : float
        See PDFMiner documentation
    boxes_flow : Optional[float]
        See PDFMiner documentation
    detect_vertical : bool
        See PDFMiner documentation
    all_texts : bool
        See PDFMiner documentation
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ):

        # All the layout-analysis options are forwarded to PDFMiner untouched.
        self.laparams = LAParams(
            line_overlap=line_overlap,
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            boxes_flow=boxes_flow,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )

    def generate_lines(self, pdf: bytes) -> pd.DataFrame:
        """
        Generates dataframe from all blocs in the PDF.

        Parameters
        ----------
        pdf : bytes
            Byte stream representing the PDF.

        Returns
        -------
        pd.DataFrame
            DataFrame representing the blocs, one row per line, with a
            positional `line_id` column. An empty frame with the same columns
            is returned when nothing could be extracted.
        """

        pdf_stream = BytesIO(pdf)

        layout = extract_pages(pdf_stream, laparams=self.laparams)
        lines = list(get_lines(layout))

        if not lines:
            # Same schema as the non-empty case (including `line_id`), so that
            # downstream code sees identical columns either way.
            return pd.DataFrame(
                columns=[
                    "page",
                    "bloc",
                    "x0",
                    "x1",
                    "y0",
                    "y1",
                    "page_width",
                    "page_height",
                    "text",
                    "styles",
                    "line_id",
                ]
            )

        df = pd.DataFrame.from_records([line.dict() for line in lines])
        df["line_id"] = range(len(df))

        return df

    def extract(self, pdf: bytes) -> pd.DataFrame:
        """
        Process a single PDF document.

        Parameters
        ----------
        pdf : bytes
            Raw byte representation of the PDF document.

        Returns
        -------
        pd.DataFrame
            DataFrame containing one row for each line extracted using PDFMiner.
        """

        lines = self.generate_lines(pdf)

        # Remove empty lines
        lines = lines[lines.text.str.len() > 0]

        # Remove lines that are outside the page
        lines = remove_outside_lines(lines, strict_mode=True)

        return lines
generate_lines(pdf)

Generates dataframe from all blocs in the PDF.

Arguments

pdf: Byte stream representing the PDF.

RETURNS DESCRIPTION
pd.DataFrame

DataFrame representing the blocs.

Source code in edspdf/extractors/pdfminer.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def generate_lines(self, pdf: bytes) -> pd.DataFrame:
    """
    Generates dataframe from all blocs in the PDF.

    Parameters
    ----------
    pdf : bytes
        Byte stream representing the PDF.

    Returns
    -------
    pd.DataFrame
        DataFrame representing the blocs, one row per line, with a positional
        `line_id` column. An empty frame with the same columns is returned
        when nothing could be extracted.
    """

    pdf_stream = BytesIO(pdf)

    layout = extract_pages(pdf_stream, laparams=self.laparams)
    lines = list(get_lines(layout))

    if not lines:
        # Same schema as the non-empty case (including `line_id`), so that
        # downstream code sees identical columns either way.
        return pd.DataFrame(
            columns=[
                "page",
                "bloc",
                "x0",
                "x1",
                "y0",
                "y1",
                "page_width",
                "page_height",
                "text",
                "styles",
                "line_id",
            ]
        )

    df = pd.DataFrame.from_records([line.dict() for line in lines])
    df["line_id"] = range(len(df))

    return df
extract(pdf)

Process a single PDF document.

PARAMETER DESCRIPTION
pdf

Raw byte representation of the PDF document.

TYPE: bytes

RETURNS DESCRIPTION
pd.DataFrame

DataFrame containing one row for each line extracted using PDFMiner.

Source code in edspdf/extractors/pdfminer.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def extract(self, pdf: bytes) -> pd.DataFrame:
    """
    Extract the usable lines of a single PDF document.

    Parameters
    ----------
    pdf : bytes
        Raw byte representation of the PDF document.

    Returns
    -------
    pd.DataFrame
        DataFrame containing one row for each line extracted using PDFMiner,
        without empty lines nor lines sticking out of the page.
    """
    raw = self.generate_lines(pdf)

    # Discard the lines that carry no text at all.
    has_text = raw.text.str.len() > 0
    kept = raw[has_text]

    # Discard the lines that are not entirely within the page.
    return remove_outside_lines(kept, strict_mode=True)

functional

get_blocs(layout)

Extract text blocs from a PDFMiner layout generator.

Arguments

layout: PDFMiner layout generator.

YIELDS DESCRIPTION
bloc

Text bloc

TYPE: Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]

Source code in edspdf/extractors/functional.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def get_blocs(
    layout: Iterator[LTPage],
) -> Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]:
    """
    Walk a PDFMiner layout generator and yield its horizontal text boxes.

    Parameters
    ----------
    layout : Iterator[LTPage]
        PDFMiner layout generator.

    Yields
    ------
    Tuple[LTTextBoxHorizontal, int, float, float]
        The text bloc, the index of its page, and the width and height of
        that page.
    """
    for page_index, page in enumerate(layout):
        page_width, page_height = page.width, page.height

        # Every other kind of layout element is skipped.
        text_boxes = (
            element for element in page if isinstance(element, LTTextBoxHorizontal)
        )
        for text_box in text_boxes:
            yield text_box, page_index, page_width, page_height

get_lines(layout)

Extract lines from a PDFMiner layout object.

The line is reframed such that the origin is the top left corner.

PARAMETER DESCRIPTION
layout

PDFMiner layout object.

TYPE: Iterator[LTPage]

YIELDS DESCRIPTION
Iterator[Line]

Single line object.

Source code in edspdf/extractors/functional.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def get_lines(layout: Iterator[LTPage]) -> Iterator[Line]:
    """
    Yield every line of every text bloc of a PDFMiner layout.

    Coordinates are divided by the page dimensions, and the vertical axis is
    flipped so that the origin is the top left corner.

    Parameters
    ----------
    layout : Iterator[LTPage]
        PDFMiner layout object.

    Yields
    ------
    Line
        Single line object.
    """
    for bloc_index, bloc_info in enumerate(get_blocs(layout)):
        bloc, page_index, page_width, page_height = bloc_info

        for raw_line in bloc:
            content, line_styles = extract_style(
                raw_line, width=page_width, height=page_height
            )

            # Flipping the axis swaps the two vertical bounds: the flipped
            # `y1` becomes the new `y0`, and conversely.
            top = 1 - raw_line.y1 / page_height
            bottom = 1 - raw_line.y0 / page_height

            yield Line(
                page=page_index,
                bloc=bloc_index,
                x0=raw_line.x0 / page_width,
                x1=raw_line.x1 / page_width,
                y0=top,
                y1=bottom,
                page_width=page_width,
                page_height=page_height,
                text=content,
                styles=line_styles,
            )

remove_outside_lines(lines, strict_mode=False)

Filter out lines that are outside the canvas.

PARAMETER DESCRIPTION
lines

Dataframe of extracted lines

TYPE: pd.DataFrame

strict_mode

Whether to remove the line if any part of it is outside the canvas, by default False

TYPE: bool, optional DEFAULT: False

RETURNS DESCRIPTION
pd.DataFrame

Filtered lines.

Source code in edspdf/extractors/functional.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def remove_outside_lines(
    lines: pd.DataFrame,
    strict_mode: bool = False,
) -> pd.DataFrame:
    """
    Filter out lines that are outside the canvas.

    The canvas is the unit square: coordinates are expected to be expressed
    as fractions of the page dimensions, between 0 and 1.

    Parameters
    ----------
    lines : pd.DataFrame
        Dataframe of extracted lines, with `x0`, `x1`, `y0` and `y1` columns.
    strict_mode : bool, optional
        Whether to remove the line if any part of it is outside the canvas,
        by default False. When False, only the lines lying entirely outside
        the canvas are removed.

    Returns
    -------
    pd.DataFrame
        Filtered lines.
    """
    if strict_mode:
        # Keep a line only if its whole box fits in the canvas.
        lower = lines[["x0", "y0"]].min(axis=1) >= 0
        upper = lines[["x1", "y1"]].max(axis=1) <= 1
        return lines[lower & upper]

    # A box lies entirely outside as soon as it ends before the canvas starts
    # on one axis, or starts after the canvas ends on one axis. The upper
    # bound of the canvas is 1: comparing with 0 there would discard nearly
    # every line within the page.
    before = lines[["x1", "y1"]].min(axis=1) < 0
    beyond = lines[["x0", "y0"]].max(axis=1) > 1
    return lines[~(before | beyond)]

base

BaseExtractor

Bases: ABC

Source code in edspdf/extractors/base.py
 6
 7
 8
 9
10
11
12
13
14
class BaseExtractor(ABC):
    """
    Interface shared by extractors: subclasses implement `extract`, and
    instances are invoked like functions.
    """

    @abstractmethod
    def extract(self, pdf: bytes) -> pd.DataFrame:
        """
        Extract the content of the PDF. Must be provided by subclasses.
        """

    def __call__(self, pdf: bytes) -> pd.DataFrame:
        """
        Shortcut for `extract(pdf)`.
        """
        extracted = self.extract(pdf)
        return extracted
extract(pdf) abstractmethod

Handles the extraction

Source code in edspdf/extractors/base.py
 7
 8
 9
10
11
@abstractmethod
def extract(self, pdf: bytes) -> pd.DataFrame:
    """
    Handles the extraction.

    Abstract hook: the body is empty here and concrete extractors supply it.

    Parameters
    ----------
    pdf : bytes
        PDF document, as bytes.

    Returns
    -------
    pd.DataFrame
        Extracted content (per the annotation).
    """

style

models

BaseStyle

Bases: BaseModel

Model acting as an abstraction for a style.

Source code in edspdf/extractors/style/models.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
class BaseStyle(BaseModel):
    """
    Model acting as an abstraction for a style.

    Holds the description of a font together with a bounding box.
    """

    # Compound font name (eg `Arial,Bold`); the only optional field.
    fontname: Optional[str] = None

    # Font family and style variant; `Style.from_fontname` derives both from
    # the compound font name.
    font: str
    style: str
    # Character size.
    size: float
    # Whether the character is upright.
    upright: bool

    # Bounding box. `Style.from_char` fills these with coordinates divided by
    # the page width/height, with the vertical axis flipped (`1 - y / height`).
    x0: float
    x1: float
    y0: float
    y1: float
Style

Bases: BaseStyle

Model acting as an abstraction for a style.

Source code in edspdf/extractors/style/models.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
class Style(BaseStyle):
    """
    Model acting as an abstraction for a style.
    """

    @classmethod
    def from_fontname(
        cls,
        fontname: str,
        size: float,
        upright: bool,
        x0: float,
        x1: float,
        y0: float,
        y1: float,
    ) -> "Style":
        """
        Constructor using the compound `fontname` representation.

        Parameters
        ----------
        fontname : str
            Compound description of the font. Often `Arial`,
            `Arial,Bold` or `Arial-Bold`
        size : float
            Character size.
        upright : bool
            Whether the character is upright.
        x0, x1, y0, y1 : float
            Bounding box, stored as given.

        Returns
        -------
        Style
            Style representation.
        """
        # Round the size to avoid floating point aberrations.
        size = round(size, 2)

        # First part is the font family; the last remaining part (if any) is
        # the style variant.
        parts = SEP_PATTERN.split(fontname)
        font = parts.pop(0)
        style = parts[-1] if parts else "Normal"

        # Build through `cls` so that subclasses get instances of themselves.
        return cls(
            fontname=fontname,
            font=font,
            style=style,
            size=size,
            upright=upright,
            x0=x0,
            x1=x1,
            y0=y0,
            y1=y1,
        )

    @classmethod
    def from_char(
        cls,
        char: LTChar,
        width: float,
        height: float,
    ) -> "Style":
        """
        Constructor using a PDFMiner character.

        Parameters
        ----------
        char : LTChar
            Character to describe.
        width : float
            Page width, used to normalise the horizontal coordinates.
        height : float
            Page height, used to normalise and flip the vertical coordinates.

        Returns
        -------
        Style
            Style representation.
        """
        return cls.from_fontname(
            fontname=char.fontname,
            size=char.size,
            upright=char.upright,
            x0=char.x0 / width,
            x1=char.x1 / width,
            # The vertical axis is flipped, which swaps the two bounds.
            y0=1 - char.y1 / height,
            y1=1 - char.y0 / height,
        )

    def __eq__(self, other: "Style") -> bool:
        """
        Computes equality between two styles.

        Only the font, style, size (rounded to two decimals) and uprightness
        are compared: the bounding box is ignored.

        Parameters
        ----------
        other : Style
            Style object to compare.

        Returns
        -------
        bool
            Whether the two styles are equal. `NotImplemented` is returned for
            objects that are not styles, so that comparing with an unrelated
            object evaluates to False instead of raising.
        """
        if not isinstance(other, BaseStyle):
            return NotImplemented

        s = (self.font, self.style, round(self.size, 2), self.upright)
        o = (other.font, other.style, round(other.size, 2), other.upright)

        return s == o

    def __add__(self, other: "Style") -> "Style":
        """
        Merge two equal styles into one spanning both bounding boxes.

        Raises
        ------
        ValueError
            If the two styles differ.
        """

        if self != other:
            raise ValueError("You cannot add two different styles")

        st = self.copy()

        # Smallest box containing both operands.
        st.x0 = min(self.x0, other.x0)
        st.x1 = max(self.x1, other.x1)
        st.y0 = min(self.y0, other.y0)
        st.y1 = max(self.y1, other.y1)

        return st
from_fontname(fontname, size, upright, x0, x1, y0, y1) classmethod

Constructor using the compound fontname representation.

PARAMETER DESCRIPTION
fontname

Compound description of the font. Often Arial, Arial,Bold or Arial-Bold

TYPE: str

size

Character size.

TYPE: float

upright

Whether the character is upright.

TYPE: bool

RETURNS DESCRIPTION
Style

Style representation.

Source code in edspdf/extractors/style/models.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
@classmethod
def from_fontname(
    cls,
    fontname: str,
    size: float,
    upright: bool,
    x0: float,
    x1: float,
    y0: float,
    y1: float,
) -> "Style":
    """
    Constructor using the compound `fontname` representation.

    Parameters
    ----------
    fontname : str
        Compound description of the font. Often `Arial`,
        `Arial,Bold` or `Arial-Bold`
    size : float
        Character size.
    upright : bool
        Whether the character is upright.
    x0, x1, y0, y1 : float
        Bounding box, stored as given.

    Returns
    -------
    Style
        Style representation.
    """
    # Round the size to avoid floating point aberrations.
    size = round(size, 2)

    # First part is the font family; the last remaining part (if any) is the
    # style variant.
    parts = SEP_PATTERN.split(fontname)
    font = parts.pop(0)
    style = parts[-1] if parts else "Normal"

    # Build through `cls` so that subclasses get instances of themselves.
    return cls(
        fontname=fontname,
        font=font,
        style=style,
        size=size,
        upright=upright,
        x0=x0,
        x1=x1,
        y0=y0,
        y1=y1,
    )
__eq__(other)

Computes equality between two styles.

PARAMETER DESCRIPTION
other

Style object to compare.

TYPE: Style

RETURNS DESCRIPTION
bool

Whether the two styles are equal.

Source code in edspdf/extractors/style/models.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def __eq__(self, other: "Style") -> bool:
    """
    Computes equality between two styles.

    Only the font, style, size (rounded to two decimals) and uprightness are
    compared: the bounding box is ignored.

    Parameters
    ----------
    other : Style
        Style object to compare.

    Returns
    -------
    bool
        Whether the two styles are equal. `NotImplemented` is returned for
        objects that are not styles, so that comparing with an unrelated
        object evaluates to False instead of raising.
    """
    if not isinstance(other, BaseStyle):
        return NotImplemented

    s = (self.font, self.style, round(self.size, 2), self.upright)
    o = (other.font, other.style, round(other.size, 2), other.upright)

    return s == o
StyledText

Bases: BaseModel

Abstraction of a word, containing the style and the text.

Source code in edspdf/extractors/style/models.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class StyledText(BaseModel):
    """
    Abstraction of a word, containing the style and the text.
    """

    text: str
    style: Style

    @classmethod
    def from_char(
        cls,
        char: LTChar,
        width: float,
        height: float,
    ) -> "StyledText":
        """
        Build a styled text from a single PDFMiner character.

        Parameters
        ----------
        char : LTChar
            Character to wrap.
        width : float
            Page width, forwarded to `Style.from_char`.
        height : float
            Page height, forwarded to `Style.from_char`.

        Returns
        -------
        StyledText
            The character's text (matches of `SPACE_PATTERN` replaced by a
            plain space) together with its style.
        """
        # Build through `cls` so that subclasses get instances of themselves.
        return cls(
            text=SPACE_PATTERN.sub(" ", char._text),
            style=Style.from_char(char, width=width, height=height),
        )

    def add_space(self) -> None:
        """
        Make the text end with exactly one space (in place).
        """
        self.text = f"{self.text.rstrip()} "

    def rstrip(self) -> None:
        """
        Remove trailing whitespace from the text (in place).
        """
        self.text = self.text.rstrip()

    def __add__(self, other: "StyledText") -> "StyledText":
        """
        Concatenate the texts and merge the styles, which must be equal
        (`Style.__add__` raises a ValueError otherwise).
        """

        st = StyledText(
            text=self.text + other.text,
            style=self.style + other.style,
        )

        return st

    def __iadd__(self, other: "StyledText") -> "StyledText":
        """
        In-place addition; returns a new object rather than mutating `self`.
        """
        return self + other

classifiers

mask

MaskClassifier

Bases: BaseClassifier

Mask classifier, that reproduces the PdfBox behaviour.

Source code in edspdf/classifiers/mask.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
class MaskClassifier(BaseClassifier):
    """
    Mask classifier, that reproduces the PdfBox behaviour.

    Lines are labelled by aligning them with the masks; a mask labelled
    `pollution`, built with default values, is always appended to the ones
    provided.
    """

    def __init__(
        self,
        *ms: Mask,
    ) -> None:
        every_mask = [*ms, Mask(label="pollution")]
        records = [mask.dict() for mask in every_mask]

        # One row per mask, used as the `labels` table of `align_labels`.
        self.comparison = pd.DataFrame.from_records(records)

    def predict(self, lines: pd.DataFrame) -> pd.Series:
        """
        Return the label assigned to each line by `align_labels`.
        """
        aligned = align_labels(lines, self.comparison)
        return aligned.label

align

align_labels(lines, labels, threshold=0.0001)

Align lines with possibly overlapping (and non-exhaustive) labels.

Possible matches are sorted by covered area. Lines with no overlap at all, or whose overlap ratio is below the threshold, are left without a label.

PARAMETER DESCRIPTION
lines

DataFrame containing the lines

TYPE: pd.DataFrame

labels

DataFrame containing the labels

TYPE: pd.DataFrame

threshold

Threshold to use for discounting a label. Used if the labels DataFrame does not provide a threshold column, or to fill NaN values thereof.

TYPE: float DEFAULT: 0.0001

RETURNS DESCRIPTION
pd.DataFrame

A copy of the lines table, with the labels added.

Source code in edspdf/classifiers/align.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def align_labels(
    lines: pd.DataFrame,
    labels: pd.DataFrame,
    threshold: float = 0.0001,
) -> pd.DataFrame:
    """
    Align lines with possibly overlapping (and non-exhaustive) labels.

    Every line is compared with every candidate label box. Candidates that
    cover a large enough share of the line come first, then candidates are
    ranked by the share of their own area that the line covers. The best
    candidate is retained; if it does not cover a large enough share of the
    line, the line is left without a label (null value).

    Parameters
    ----------
    lines : pd.DataFrame
        DataFrame containing the lines, with `x0`, `x1`, `y0` and `y1`
        columns. It is not modified.
    labels : pd.DataFrame
        DataFrame containing the labels, with `label`, `X0`, `X1`, `Y0` and
        `Y1` columns. Columns shared with the lines (eg `page`) are used as
        join keys. It is not modified.
    threshold : float, default 0.0001
        Threshold to use for discounting a label. Used if the `labels` DataFrame
        does not provide a `threshold` column, or to fill `NaN` values thereof.

    Returns
    -------
    pd.DataFrame
        A copy of the lines table, with the labels added.
    """

    # Work on a copy carrying a positional identifier: the caller's frame is
    # never touched, so a pre-existing `uid` column survives there and no
    # stray column is left behind if something fails along the way.
    lines = lines.assign(uid=range(len(lines)))

    df = lines[
        sorted({"uid", "page", "x0", "y0", "x1", "y1"} & set(lines.columns))
    ].copy()
    labels = labels.copy()

    if "threshold" not in labels.columns:
        labels["threshold"] = threshold

    labels.threshold = labels.threshold.fillna(threshold)

    # Join on the shared columns if there are any, otherwise pair every line
    # with every label.
    df = df.merge(
        labels, how="inner" if set(df.columns) & set(labels.columns) else "cross"
    )

    # Width and height of the intersection (negative when disjoint).
    df["dx"] = df[["x1", "X1"]].min(axis=1) - df[["x0", "X0"]].max(axis=1)
    df["dy"] = df[["y1", "Y1"]].min(axis=1) - df[["y0", "Y0"]].max(axis=1)

    # Area of the intersection, forced to zero when the boxes are disjoint.
    df["overlap"] = (df.dx > 0) * (df.dy > 0) * df.dx * df.dy

    # Share of the line covered by the label box.
    df["area"] = (df.x1 - df.x0) * (df.y1 - df.y0)
    df["ratio"] = df.overlap / df.area

    # Share of the label box covered by the line.
    df["area_mask"] = (df.X1 - df.X0) * (df.Y1 - df.Y0)
    df["ratio_mask"] = df.overlap / df.area_mask

    df["thresholded"] = df.ratio >= df.threshold

    # Best candidate first: above the threshold, then highest `ratio_mask`.
    df = df.sort_values(["thresholded", "ratio_mask"], ascending=False)

    df = df.groupby(["uid"], as_index=False).first()
    df = df.sort_values("uid").reset_index(drop=True)

    # Discard the label when even the best candidate is under the threshold.
    df.label = df.label.where(df.thresholded)

    return lines.merge(df[["uid", "label"]], on="uid").drop(columns=["uid"])

random

RandomClassifier

Bases: BaseClassifier

Random classifier, for chaos purposes. Classifies each line to a random element.

Source code in edspdf/classifiers/random.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
@registry.classifiers.register("random.v1")
class RandomClassifier(BaseClassifier):
    """
    Random classifier, for chaos purposes. Classifies each line to a random element.

    Parameters
    ----------
    classes : Union[List[str], Dict[str, float]]
        Either the list of classes (drawn uniformly), or a mapping from each
        class to its weight. Weights are normalised to sum to one.
    seed : Optional[int]
        Seed of the random generator.
    """

    def __init__(
        self,
        classes: Union[List[str], Dict[str, float]],
        seed: Optional[int] = 0,
    ) -> None:

        if isinstance(classes, list):
            classes = {c: 1 for c in classes}

        # Computed once, rather than once per class.
        total = sum(classes.values())
        self.classes = {c: w / total for c, w in classes.items()}

        self.rgn = np.random.default_rng(seed=seed)

    def predict(self, lines: pd.DataFrame) -> List[str]:
        """
        Draw one class per line, following the normalised weights.
        """
        choices = self.rgn.choice(
            list(self.classes.keys()),
            p=list(self.classes.values()),
            size=len(lines),
        )

        # `tolist` yields plain Python strings, as the annotation promises,
        # instead of numpy scalars.
        return choices.tolist()

dummy

DummyClassifier

Bases: BaseClassifier

"Dummy" classifier, for testing purposes. Classifies every line to body.

Source code in edspdf/classifiers/dummy.py
10
11
12
13
14
15
16
17
@registry.classifiers.register("dummy.v1")
class DummyClassifier(BaseClassifier):
    """
    Trivial classifier meant for tests: every line is labelled ``body``.
    """

    def predict(self, lines: pd.DataFrame) -> List[str]:
        """
        Return one ``body`` label per line.
        """
        return ["body" for _ in range(len(lines))]

base

BaseClassifier

Bases: ABC

Source code in edspdf/classifiers/base.py
 7
 8
 9
10
11
12
13
14
15
class BaseClassifier(ABC):
    """
    Interface shared by classifiers: subclasses implement `predict`, and
    instances are invoked like functions.
    """

    @abstractmethod
    def predict(self, lines: pd.DataFrame) -> List[str]:
        """
        Assign a label to each line. Must be provided by subclasses.
        """

    def __call__(self, lines: pd.DataFrame) -> List[str]:
        """
        Shortcut for `predict(lines)`.
        """
        predicted = self.predict(lines)
        return predicted
predict(lines) abstractmethod

Handles the classification.

Source code in edspdf/classifiers/base.py
 8
 9
10
11
12
@abstractmethod
def predict(self, lines: pd.DataFrame) -> List[str]:
    """
    Handles the classification.

    Abstract hook: the body is empty here and concrete classifiers supply it.

    Parameters
    ----------
    lines : pd.DataFrame
        Lines to classify.

    Returns
    -------
    List[str]
        Predicted labels (per the annotation, a list of strings).
    """