`edspdf`

`loading`

`load(path)`

Load a complete pipeline.

TODO: implement other ways to load a pipeline.

PARAMETER DESCRIPTION

path

Path to the pipeline.

TYPE: Path

RETURNS	DESCRIPTION
`PdfReader`	A PdfReader object.

Source code in edspdf/loading.py

def load(path: Path) -> PdfReader:
    """
    Build a complete pipeline from a configuration file stored on disk.

    The file is parsed into a `Config`, the configuration is resolved through
    the registry, and the entry stored under the `"reader"` key is returned.

    TODO: implement other ways to load a pipeline.

    Parameters
    ----------
    path : Path
        Location of the pipeline configuration file.

    Returns
    -------
    PdfReader
        The object found under the `"reader"` key of the resolved configuration.
    """
    resolved = registry.resolve(Config().from_disk(path))
    return resolved["reader"]

`from_str(config)`

Load a complete pipeline from a string config.

PARAMETER DESCRIPTION

config

Configuration.

TYPE: str

RETURNS	DESCRIPTION
`PdfReader`	A PdfReader object.

Source code in edspdf/loading.py

def from_str(config: str) -> PdfReader:
    """
    Build a complete pipeline from a configuration given as a string.

    The string is parsed into a `Config`, the configuration is resolved through
    the registry, and the entry stored under the `"reader"` key is returned.

    Parameters
    ----------
    config : str
        Textual configuration of the pipeline.

    Returns
    -------
    PdfReader
        The object found under the `"reader"` key of the resolved configuration.
    """
    parsed = Config().from_str(config)
    pipeline = registry.resolve(parsed)
    return pipeline["reader"]

`classifiers`

`align`

`align_labels(lines, labels, threshold=0.0001)`

Align lines with possibly overlapping (and non-exhaustive) labels.

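Possible matches are sorted by covered area. Lines whose best match does not reach the threshold (in particular lines with no overlap at all, for any positive threshold) are left without a label.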

PARAMETER DESCRIPTION

lines

DataFrame containing the lines

TYPE: pd.DataFrame

labels

DataFrame containing the labels

TYPE: pd.DataFrame

threshold

Threshold to use for discounting a label. Used if the labels DataFrame does not provide a threshold column, or to fill NaN values thereof.

TYPE: float DEFAULT: 0.0001

RETURNS	DESCRIPTION
`pd.DataFrame`	A copy of the lines table, with the labels added.

Source code in edspdf/classifiers/align.py

def align_labels(
    lines: pd.DataFrame,
    labels: pd.DataFrame,
    threshold: float = 0.0001,
) -> pd.DataFrame:
    """
    Align lines with possibly overlapping (and non-exhaustive) labels.

    Every line is paired with the candidate labels (on the columns the two
    tables share, or with all labels when they share none). For each pair the
    intersection area of the two boxes is computed. Candidates are ranked first
    by whether the covered fraction of the line reaches the threshold, then by
    the covered fraction of the label box, and the best candidate is kept.
    Lines whose best candidate does not reach the threshold get a missing label.

    Neither `lines` nor `labels` is modified.

    Parameters
    ----------
    lines : pd.DataFrame
        DataFrame containing the lines, with box columns `x0`, `x1`, `y0`, `y1`.
    labels : pd.DataFrame
        DataFrame containing the labels, with box columns `X0`, `X1`, `Y0`,
        `Y1` and a `label` column.
    threshold : float, default 0.0001
        Threshold to use for discounting a label. Used if the `labels` DataFrame
        does not provide a `threshold` column, or to fill `NaN` values thereof.

    Returns
    -------
    pd.DataFrame
        A copy of the lines table, with the labels added.
    """

    # `assign` returns a new frame: the temporary identifier never touches the
    # caller's DataFrame (which may be a slice, or already own a `uid` column).
    lines = lines.assign(uid=range(len(lines)))

    df = lines[
        sorted({"uid", "page", "x0", "y0", "x1", "y1"} & set(lines.columns))
    ].copy()
    labels = labels.copy()

    if "threshold" not in labels.columns:
        labels["threshold"] = threshold

    labels.threshold = labels.threshold.fillna(threshold)

    # Join on the shared columns when there are some, otherwise pair every line
    # with every label.
    df = df.merge(
        labels, how="inner" if set(df.columns) & set(labels.columns) else "cross"
    )

    # Width and height of the intersection (negative when the boxes are disjoint).
    df["dx"] = df[["x1", "X1"]].min(axis=1) - df[["x0", "X0"]].max(axis=1)
    df["dy"] = df[["y1", "Y1"]].min(axis=1) - df[["y0", "Y0"]].max(axis=1)

    # Zero out the area as soon as one dimension does not overlap.
    df["overlap"] = (df.dx > 0) * (df.dy > 0) * df.dx * df.dy

    # Fraction of the line covered by the label.
    df["area"] = (df.x1 - df.x0) * (df.y1 - df.y0)
    df["ratio"] = df.overlap / df.area

    # Fraction of the label covered by the line.
    df["area_mask"] = (df.X1 - df.X0) * (df.Y1 - df.Y0)
    df["ratio_mask"] = df.overlap / df.area_mask

    df["thresholded"] = df.ratio >= df.threshold

    # Best candidate first: above-threshold matches, then largest label coverage.
    df = df.sort_values(["thresholded", "ratio_mask"], ascending=False)

    df = df.groupby(["uid"], as_index=False).first()
    df = df.sort_values("uid").reset_index(drop=True)

    # Blank the label when even the best candidate is below the threshold.
    df.label = df.label.where(df.thresholded)

    return lines.merge(df[["uid", "label"]], on="uid").drop(columns=["uid"])

`random`

`RandomClassifier`

Bases: BaseClassifier

Random classifier, for chaos purposes. Classifies each line to a random element.

Source code in edspdf/classifiers/random.py

@registry.classifiers.register("random.v1")
class RandomClassifier(BaseClassifier):
    """
    Classifier that draws a label at random for every line, for chaos purposes.

    Parameters
    ----------
    classes : Union[List[str], Dict[str, float]]
        Either a list of labels (all given the same weight) or a mapping from
        label to weight. Weights are normalised so that they sum to one.
    seed : Optional[int]
        Seed handed to the NumPy random generator.
    """

    def __init__(
        self,
        classes: Union[List[str], Dict[str, float]],
        seed: Optional[int] = 0,
    ) -> None:

        # A bare list of labels means uniform weights.
        weights = dict.fromkeys(classes, 1) if isinstance(classes, list) else classes

        # Normalise the weights into probabilities.
        total = sum(weights.values())
        self.classes = {label: weight / total for label, weight in weights.items()}

        self.rgn = np.random.default_rng(seed=seed)

    def predict(self, lines: pd.DataFrame) -> List[str]:
        """Draw one label per line according to the class probabilities."""
        candidates = list(self.classes)
        probabilities = list(self.classes.values())

        drawn = self.rgn.choice(candidates, p=probabilities, size=len(lines))
        return list(drawn)

`mask`

`MaskClassifier`

Bases: BaseClassifier

Mask classifier, that reproduces the PdfBox behaviour.

Source code in edspdf/classifiers/mask.py

class MaskClassifier(BaseClassifier):
    """
    Mask classifier, that reproduces the PdfBox behaviour.

    The masks received at construction, followed by an extra mask labelled
    `"pollution"`, are stored as a DataFrame against which lines are aligned.
    """

    def __init__(
        self,
        *ms: Mask,
    ) -> None:

        # User-provided masks first, the default "pollution" mask last.
        records = [mask.dict() for mask in (*ms, Mask(label="pollution"))]
        self.comparison = pd.DataFrame.from_records(records)

    def predict(self, lines: pd.DataFrame) -> pd.Series:
        """Align the lines with the stored masks and return the `label` column."""
        return align_labels(lines, self.comparison).label

`dummy`

`DummyClassifier`

Bases: BaseClassifier

"Dummy" classifier, for testing purposes. Classifies every line to body.

Source code in edspdf/classifiers/dummy.py

@registry.classifiers.register("dummy.v1")
class DummyClassifier(BaseClassifier):
    """
    "Dummy" classifier, for testing purposes: every line is labelled ``body``.
    """

    def predict(self, lines: pd.DataFrame) -> List[str]:
        """Return one ``"body"`` label per line."""
        return ["body" for _ in range(len(lines))]

`base`

`BaseClassifier`

Bases: ABC

Source code in edspdf/classifiers/base.py

class BaseClassifier(ABC):
    """
    Abstract interface of the classifiers.

    Subclasses implement `predict`; calling an instance is the same as calling
    its `predict` method.
    """

    @abstractmethod
    def predict(self, lines: pd.DataFrame) -> List[str]:
        """
        Classify the lines. To be implemented by subclasses.
        """

    def __call__(self, lines: pd.DataFrame) -> List[str]:
        """Forward the lines to `predict` and hand back its result."""
        predicted = self.predict(lines)
        return predicted

`predict(lines)` `abstractmethod`

Handles the classification.

Source code in edspdf/classifiers/base.py

@abstractmethod
def predict(self, lines: pd.DataFrame) -> List[str]:
    """
    Classify the lines.

    Abstract method: it has no implementation here and must be provided by
    the concrete classifier.
    """

`extractors`

`functional`

`get_blocs(layout)`

Extract text blocs from a PDFMiner layout generator.

PARAMETER DESCRIPTION

layout

PDFMiner layout generator.

TYPE: Iterator[LTPage]

YIELDS DESCRIPTION

bloc

Text bloc

TYPE: Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]

Source code in edspdf/extractors/functional.py

def get_blocs(
    layout: Iterator[LTPage],
) -> Iterator[Tuple[LTTextBoxHorizontal, int, float, float]]:
    """
    Extract text blocs from a PDFMiner layout generator.

    Only the `LTTextBoxHorizontal` elements of each page are kept.

    Parameters
    ----------
    layout : Iterator[LTPage]
        PDFMiner layout generator.

    Yields
    ------
    Tuple[LTTextBoxHorizontal, int, float, float]
        The text bloc, the index of its page (starting at 0), and the width
        and height of that page.
    """

    for page_index, page in enumerate(layout):

        page_size = (page.width, page.height)

        text_boxes = (item for item in page if isinstance(item, LTTextBoxHorizontal))
        for box in text_boxes:
            yield (box, page_index, *page_size)

`get_lines(layout)`

Extract lines from a PDFMiner layout object.

The line is reframed such that the origin is the top left corner.

PARAMETER DESCRIPTION

layout

PDFMiner layout object.

TYPE: Iterator[LTPage]

YIELDS	DESCRIPTION
`Iterator[Line]`	Single line object.

Source code in edspdf/extractors/functional.py

def get_lines(layout: Iterator[LTPage]) -> Iterator[Line]:
    """
    Extract lines from a PDFMiner layout object.

    Coordinates are divided by the page width/height, and the vertical axis is
    flipped so that the origin is the top left corner.

    Parameters
    ----------
    layout : Iterator[LTPage]
        PDFMiner layout object.

    Yields
    ------
    Line
        Single line object, carrying its page index, its bloc index (counted
        over the whole document), its box, the page size, its text and styles.
    """
    for bloc_index, (bloc, page, width, height) in enumerate(get_blocs(layout)):
        for line in bloc:
            text, styles = extract_style(line, width=width, height=height)

            # Flip the vertical axis: the upper edge (y1) becomes the new y0.
            top = 1 - line.y1 / height
            bottom = 1 - line.y0 / height

            yield Line(
                page=page,
                bloc=bloc_index,
                x0=line.x0 / width,
                x1=line.x1 / width,
                y0=top,
                y1=bottom,
                page_width=width,
                page_height=height,
                text=text,
                styles=styles,
            )

`remove_outside_lines(lines, strict_mode=False)`

Filter out lines that are outside the canvas.

PARAMETER DESCRIPTION

lines

Dataframe of extracted lines

TYPE: pd.DataFrame

strict_mode

Whether to remove the line if any part of it is outside the canvas, by default False

TYPE: bool, optional DEFAULT: False

RETURNS	DESCRIPTION
`pd.DataFrame`	Filtered lines.

Source code in edspdf/extractors/functional.py

def remove_outside_lines(
    lines: pd.DataFrame,
    strict_mode: bool = False,
) -> pd.DataFrame:
    """
    Filter out lines that are outside the canvas.

    The canvas is the unit square: a line box spans `[x0, x1]` horizontally
    and `[y0, y1]` vertically, and is compared against the `[0, 1]` range.

    Parameters
    ----------
    lines : pd.DataFrame
        Dataframe of extracted lines, with `x0`, `x1`, `y0` and `y1` columns.
    strict_mode : bool, optional
        Whether to remove the line if any part of it is outside the canvas,
        by default False. When False, a line is removed only if it lies
        entirely outside the canvas.

    Returns
    -------
    pd.DataFrame
        Filtered lines.
    """
    if strict_mode:
        # Keep only the lines that fit entirely inside the canvas.
        lower = lines[["x0", "y0"]].min(axis=1) >= 0
        upper = lines[["x1", "y1"]].max(axis=1) <= 1
        return lines[lower & upper]

    # A box misses the canvas as soon as it ends before 0 or starts after 1
    # along either axis; anything else overlaps the canvas and is kept.
    before = lines[["x1", "y1"]].min(axis=1) < 0
    after = lines[["x0", "y0"]].max(axis=1) > 1
    return lines[~(before | after)]

`base`

`BaseExtractor`

Bases: ABC

Source code in edspdf/extractors/base.py

class BaseExtractor(ABC):
    """
    Abstract interface of the extractors.

    Subclasses implement `extract`; calling an instance is the same as calling
    its `extract` method.
    """

    @abstractmethod
    def extract(self, pdf: bytes) -> pd.DataFrame:
        """
        Extract the content of the PDF. To be implemented by subclasses.
        """

    def __call__(self, pdf: bytes) -> pd.DataFrame:
        """Forward the PDF bytes to `extract` and hand back its result."""
        extracted = self.extract(pdf)
        return extracted

`extract(pdf)` `abstractmethod`

Handles the extraction

Source code in edspdf/extractors/base.py

@abstractmethod
def extract(self, pdf: bytes) -> pd.DataFrame:
    """
    Extract the content of the PDF.

    Abstract method: it has no implementation here and must be provided by
    the concrete extractor.
    """

`pdfminer`

`PdfMinerExtractor`

Bases: BaseExtractor

Extractor object. Given a PDF byte stream, produces a list of blocs.

PARAMETER	DESCRIPTION
`line_overlap`	See PDFMiner documentation TYPE: `float` DEFAULT: `0.5`
`char_margin`	See PDFMiner documentation TYPE: `float` DEFAULT: `2.0`
`line_margin`	See PDFMiner documentation TYPE: `float` DEFAULT: `0.5`
`word_margin`	See PDFMiner documentation TYPE: `float` DEFAULT: `0.1`
`boxes_flow`	See PDFMiner documentation TYPE: `Optional[float]` DEFAULT: `0.5`
`detect_vertical`	See PDFMiner documentation TYPE: `bool` DEFAULT: `False`
`all_texts`	See PDFMiner documentation TYPE: `bool` DEFAULT: `False`

Source code in edspdf/extractors/pdfminer.py

@registry.extractors.register("pdfminer.v1")
class PdfMinerExtractor(BaseExtractor):
    """
    Extractor object. Given a PDF byte stream, produces a DataFrame of lines.

    Parameters
    ----------
    line_overlap : float
        See PDFMiner documentation
    char_margin : float
        See PDFMiner documentation
    line_margin : float
        See PDFMiner documentation
    word_margin : float
        See PDFMiner documentation
    boxes_flow : Optional[float]
        See PDFMiner documentation
    detect_vertical : bool
        See PDFMiner documentation
    all_texts : bool
        See PDFMiner documentation
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ):

        # The layout-analysis parameters are built once and reused for every PDF.
        self.laparams = LAParams(
            line_overlap=line_overlap,
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            boxes_flow=boxes_flow,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )

    def generate_lines(self, pdf: bytes) -> pd.DataFrame:
        """
        Generates a dataframe from all the lines found in the PDF.

        Parameters
        ----------
        pdf : bytes
            Byte stream representing the PDF.

        Returns
        -------
        pd.DataFrame
            DataFrame with one row per line and a `line_id` column numbering
            the rows from 0. An empty frame with the same columns is returned
            when no line is found.
        """

        pdf_stream = BytesIO(pdf)

        layout = extract_pages(pdf_stream, laparams=self.laparams)
        lines = list(get_lines(layout))

        if not lines:
            # Keep the schema stable: the empty frame also exposes `line_id`,
            # so downstream code can rely on the column being present.
            return pd.DataFrame(
                columns=[
                    "page",
                    "bloc",
                    "x0",
                    "x1",
                    "y0",
                    "y1",
                    "page_width",
                    "page_height",
                    "text",
                    "styles",
                    "line_id",
                ]
            )

        df = pd.DataFrame.from_records([line.dict() for line in lines])
        df["line_id"] = range(len(df))

        return df

    def extract(self, pdf: bytes) -> pd.DataFrame:
        """
        Process a single PDF document.

        Lines with an empty text, and lines that are not entirely inside the
        page, are discarded.

        Parameters
        ----------
        pdf : bytes
            Raw byte representation of the PDF document.

        Returns
        -------
        pd.DataFrame
            DataFrame containing one row for each line extracted using PDFMiner.
        """

        lines = self.generate_lines(pdf)

        # Remove empty lines
        lines = lines[lines.text.str.len() > 0]

        # Remove lines that are outside the page
        lines = remove_outside_lines(lines, strict_mode=True)

        return lines

`generate_lines(pdf)`

Generates dataframe from all blocs in the PDF.

PARAMETER DESCRIPTION

pdf

Byte stream representing the PDF.

TYPE: bytes

RETURNS	DESCRIPTION
`pd.DataFrame`	DataFrame representing the blocs.

Source code in edspdf/extractors/pdfminer.py

def generate_lines(self, pdf: bytes) -> pd.DataFrame:
    """
    Generates a dataframe from all the lines found in the PDF.

    Parameters
    ----------
    pdf : bytes
        Byte stream representing the PDF.

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per line and a `line_id` column numbering
        the rows from 0. An empty frame with the same columns is returned
        when no line is found.
    """

    pdf_stream = BytesIO(pdf)

    layout = extract_pages(pdf_stream, laparams=self.laparams)
    lines = list(get_lines(layout))

    if not lines:
        # Keep the schema stable: the empty frame also exposes `line_id`,
        # so downstream code can rely on the column being present.
        return pd.DataFrame(
            columns=[
                "page",
                "bloc",
                "x0",
                "x1",
                "y0",
                "y1",
                "page_width",
                "page_height",
                "text",
                "styles",
                "line_id",
            ]
        )

    df = pd.DataFrame.from_records([line.dict() for line in lines])
    df["line_id"] = range(len(df))

    return df

`extract(pdf)`

Process a single PDF document.

PARAMETER DESCRIPTION

pdf

Raw byte representation of the PDF document.

TYPE: bytes

RETURNS	DESCRIPTION
`pd.DataFrame`	DataFrame containing one row for each line extracted using PDFMiner.

Source code in edspdf/extractors/pdfminer.py

def extract(self, pdf: bytes) -> pd.DataFrame:
    """
    Process a single PDF document.

    Lines with an empty text, and lines that are not entirely inside the
    page, are discarded.

    Parameters
    ----------
    pdf : bytes
        Raw byte representation of the PDF document.

    Returns
    -------
    pd.DataFrame
        DataFrame containing one row for each line extracted using PDFMiner.
    """

    extracted = self.generate_lines(pdf)

    # Drop the lines that carry no text.
    has_text = extracted.text.str.len() > 0

    # Then drop the lines that stick out of the page.
    return remove_outside_lines(extracted[has_text], strict_mode=True)

`style`

`models`

`BaseStyle`

Bases: BaseModel

Model acting as an abstraction for a style.

Source code in edspdf/extractors/style/models.py

class BaseStyle(BaseModel):
    """
    Model acting as an abstraction for a style.

    Groups the font description of a piece of text with its bounding box.
    """

    # Compound font name; optional, defaults to None.
    fontname: Optional[str] = None

    # Font description: font name, style name, character size and upright flag.
    font: str
    style: str
    size: float
    upright: bool

    # Bounding box.
    # NOTE(review): `Style.from_char` fills these with coordinates divided by
    # the page width/height - presumably they are always relative; confirm.
    x0: float
    x1: float
    y0: float
    y1: float

`Style`

Bases: BaseStyle

Model acting as an abstraction for a style.

Source code in edspdf/extractors/style/models.py

class Style(BaseStyle):
    """
    Model acting as an abstraction for a style.

    Adds constructors from a compound font name or a PDFMiner character,
    an equality that ignores the bounding box, and an addition that merges
    the bounding boxes of two equal styles.
    """

    @classmethod
    def from_fontname(
        cls,
        fontname: str,
        size: float,
        upright: bool,
        x0: float,
        x1: float,
        y0: float,
        y1: float,
    ) -> "Style":
        """
        Constructor using the compound `fontname` representation.

        The font name is split with `SEP_PATTERN`: the first part is the font,
        the last remaining part (if any) is the style, `"Normal"` otherwise.

        Parameters
        ----------
        fontname : str
            Compound description of the font. Often `Arial`,
            `Arial,Bold` or `Arial-Bold`
        size : float
            Character size.
        upright : bool
            Whether the character is upright.
        x0, x1, y0, y1 : float
            Bounding box, stored unchanged.

        Returns
        -------
        Style
            Style representation.
        """
        # Round the size to avoid floating point aberrations.
        size = round(size, 2)

        parts = SEP_PATTERN.split(fontname)

        font = parts.pop(0)
        style = parts[-1] if parts else "Normal"

        # Build through `cls` so that subclasses get instances of their own type.
        return cls(
            fontname=fontname,
            font=font,
            style=style,
            size=size,
            upright=upright,
            x0=x0,
            x1=x1,
            y0=y0,
            y1=y1,
        )

    @classmethod
    def from_char(
        cls,
        char: LTChar,
        width: float,
        height: float,
    ):
        """
        Constructor from a PDFMiner character.

        Coordinates are divided by `width`/`height`, and the vertical axis is
        flipped (`1 - y / height`).
        """
        return cls.from_fontname(
            fontname=char.fontname,
            size=char.size,
            upright=char.upright,
            x0=char.x0 / width,
            x1=char.x1 / width,
            y0=1 - char.y1 / height,
            y1=1 - char.y0 / height,
        )

    def __eq__(self, other: "Style") -> bool:
        """
        Computes equality between two styles.

        Only the font, style, size (rounded to two decimals) and upright flag
        are compared; the bounding box is ignored.

        Parameters
        ----------
        other : Style
            Style object to compare.

        Returns
        -------
        bool
            Whether the two styles are equal. `NotImplemented` is returned for
            objects that do not expose the compared attributes, which makes
            `style == None` evaluate to `False` instead of raising.
        """

        try:
            o = (other.font, other.style, round(other.size, 2), other.upright)
        except AttributeError:
            return NotImplemented

        s = (self.font, self.style, round(self.size, 2), self.upright)

        return s == o

    def __add__(self, other: "Style") -> "Style":
        """
        Merge two equal styles: the result is a copy of `self` whose bounding
        box encloses both boxes.

        Raises
        ------
        ValueError
            If the two styles are not equal.
        """

        if self != other:
            raise ValueError("You cannot add two different styles")

        st = self.copy()

        st.x0 = min(self.x0, other.x0)
        st.x1 = max(self.x1, other.x1)
        st.y0 = min(self.y0, other.y0)
        st.y1 = max(self.y1, other.y1)

        return st

`from_fontname(fontname, size, upright, x0, x1, y0, y1)` `classmethod`

Constructor using the compound fontname representation.

PARAMETER DESCRIPTION

fontname

Compound description of the font. Often Arial, Arial,Bold or Arial-Bold

TYPE: str

size

Character size.

TYPE: float

upright

Whether the character is upright.

TYPE: bool

RETURNS	DESCRIPTION
`Style`	Style representation.

Source code in edspdf/extractors/style/models.py

@classmethod
def from_fontname(
    cls,
    fontname: str,
    size: float,
    upright: bool,
    x0: float,
    x1: float,
    y0: float,
    y1: float,
) -> "Style":
    """
    Constructor using the compound `fontname` representation.

    The font name is split with `SEP_PATTERN`: the first part is the font,
    the last remaining part (if any) is the style, `"Normal"` otherwise.

    Parameters
    ----------
    fontname : str
        Compound description of the font. Often `Arial`,
        `Arial,Bold` or `Arial-Bold`
    size : float
        Character size.
    upright : bool
        Whether the character is upright.
    x0, x1, y0, y1 : float
        Bounding box, stored unchanged.

    Returns
    -------
    Style
        Style representation.
    """
    # Round the size to avoid floating point aberrations.
    size = round(size, 2)

    parts = SEP_PATTERN.split(fontname)

    font = parts.pop(0)
    style = parts[-1] if parts else "Normal"

    # Build through `cls` so that subclasses get instances of their own type.
    return cls(
        fontname=fontname,
        font=font,
        style=style,
        size=size,
        upright=upright,
        x0=x0,
        x1=x1,
        y0=y0,
        y1=y1,
    )

`__eq__(other)`

Computes equality between two styles.

PARAMETER DESCRIPTION

other

Style object to compare.

TYPE: Style

RETURNS	DESCRIPTION
`bool`	Whether the two styles are equal.

Source code in edspdf/extractors/style/models.py

def __eq__(self, other: "Style") -> bool:
    """
    Computes equality between two styles.

    Only the font, style, size (rounded to two decimals) and upright flag
    are compared; the bounding box is ignored.

    Parameters
    ----------
    other : Style
        Style object to compare.

    Returns
    -------
    bool
        Whether the two styles are equal. `NotImplemented` is returned for
        objects that do not expose the compared attributes, which makes
        `style == None` evaluate to `False` instead of raising.
    """

    try:
        o = (other.font, other.style, round(other.size, 2), other.upright)
    except AttributeError:
        return NotImplemented

    s = (self.font, self.style, round(self.size, 2), self.upright)

    return s == o

`StyledText`

Bases: BaseModel

Abstraction of a word, containing the style and the text.

Source code in edspdf/extractors/style/models.py

class StyledText(BaseModel):
    """
    Abstraction of a word: a piece of text together with its style.
    """

    text: str
    style: Style

    @classmethod
    def from_char(
        cls,
        char: LTChar,
        width: float,
        height: float,
    ):
        """
        Build a styled text from a single PDFMiner character; matches of
        `SPACE_PATTERN` in the character text are replaced by a plain space.
        """
        normalised = SPACE_PATTERN.sub(" ", char._text)
        char_style = Style.from_char(char, width=width, height=height)
        return StyledText(text=normalised, style=char_style)

    def add_space(self) -> None:
        """Make the text end with exactly one trailing space."""
        self.text = self.text.rstrip() + " "

    def rstrip(self) -> None:
        """Remove trailing whitespace from the text, in place."""
        stripped = self.text.rstrip()
        self.text = stripped

    def __add__(self, other: "StyledText") -> "StyledText":
        """Concatenate the texts and add the styles into a new object."""
        return StyledText(
            text=self.text + other.text,
            style=self.style + other.style,
        )

    def __iadd__(self, other: "StyledText") -> "StyledText":
        """In-place addition; yields the same new object as `self + other`."""
        merged = self + other
        return merged

`transforms`

`base`

`BaseTransform`

Bases: ABC

Source code in edspdf/transforms/base.py

class BaseTransform(ABC):
    """
    Abstract interface of the transformations.

    Subclasses implement `transform`; calling an instance is the same as
    calling its `transform` method.
    """

    @abstractmethod
    def transform(self, lines: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the lines. To be implemented by subclasses.
        """

    def __call__(self, lines: pd.DataFrame) -> pd.DataFrame:
        """Forward the lines to `transform` and hand back its result."""
        transformed = self.transform(lines)
        return transformed

`transform(lines)` `abstractmethod`

Handles the transformation

Source code in edspdf/transforms/base.py

@abstractmethod
def transform(self, lines: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the lines.

    Abstract method: it has no implementation here and must be provided by
    the concrete transformation.
    """

`readers`

`reader`

`PdfReader`

Source code in edspdf/readers/reader.py

@registry.readers.register("pdf-reader.v1")
class PdfReader:
    def __init__(
        self,
        extractor: Optional[BaseExtractor] = None,
        classifier: Optional[BaseClassifier] = None,
        aggregator: Optional[BaseAggregator] = None,
        transform: Optional[BaseTransform] = None,
        meta_labels: Dict[str, str] = dict(),
    ) -> None:
        """
        Reads a text-based PDF document.

        Parameters
        ----------
        extractor : BaseExtractor
            Text bloc extractor.
        classifier : BaseClassifier
            Classifier model, to assign a section (eg `body`, `header`, etc).
        aggregator : BaseAggregator
            Aggregator model, to compile labelled text blocs together.
        transform : BaseTransform, optional
            Transformation to apply before classification.
        meta_labels : Dict[str, str], optional
            Dictionary of hierarchical labels
            (eg `table` is probably within the `body`).
        """

        self.extractor = extractor
        self.classifier = classifier
        self.aggregator = aggregator

        self.transform = transform

        # Store a private copy: the default value is a single dict object
        # created at definition time, and must not be shared between readers.
        self.meta_labels = dict(meta_labels or {})

    def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
        """
        Predict the label of each text bloc.

        The `label` and `meta_label` columns are written on `lines` itself.

        Parameters
        ----------
        lines : pd.DataFrame
            Text blocs to label.

        Returns
        -------
        pd.DataFrame
            Labelled text blocs.
        """

        lines["label"] = self.classifier.predict(lines)
        # Labels without an entry in `meta_labels` are kept unchanged.
        lines["meta_label"] = lines.label.replace(self.meta_labels)

        return lines

    def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """
        Prepare data before classification.
        Can also be used to generate the training dataset for the classifier.

        Parameters
        ----------
        pdf : bytes
            PDF document, as bytes.
        context : Any
            Extra values, each written as a column holding that value
            on every row.

        Returns
        -------
        pd.DataFrame
            Text blocs as a pandas DataFrame.
        """

        lines = self.extractor(pdf)

        for key, value in context.items():
            lines[key] = value

        # Apply transformation
        if self.transform is not None:
            lines = self.transform(lines)

        return lines

    def prepare_and_predict(self, pdf: bytes, **context: Any) -> pd.DataFrame:
        """Chain `prepare_data` and `predict` on a single PDF document."""
        lines = self.prepare_data(pdf, **context)
        lines = self.predict(lines)
        return lines

    def __call__(
        self, pdf: bytes, **context: Any
    ) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
        """
        Process the PDF document.

        Parameters
        ----------
        pdf : bytes
            Byte representation of the PDF document.

        context : Any
            Any contextual information that is used by the classifier
            (eg document type or source).

        Returns
        -------
        Dict[str, str]
            Dictionary containing the aggregated text (more generally,
            whatever the aggregator returns).
        """
        lines = self.prepare_and_predict(pdf, **context)
        result = self.aggregator(lines)
        return result

`__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())`

Reads a text-based PDF document.

PARAMETER	DESCRIPTION
`extractor`	Text bloc extractor. TYPE: `BaseExtractor` DEFAULT: `None`
`classifier`	Classifier model, to assign a section (eg `body`, `header`, etc). TYPE: `BaseClassifier` DEFAULT: `None`
`aggregator`	Aggregator model, to compile labelled text blocs together. TYPE: `BaseAggregator` DEFAULT: `None`
`transform`	Transformation to apply before classification. TYPE: `BaseTransform, optional` DEFAULT: `None`
`meta_labels`	Dictionary of hierarchical labels (eg `table` is probably within the `body`). TYPE: `Dict[str, str], optional` DEFAULT: `dict()`

Source code in edspdf/readers/reader.py

def __init__(
    self,
    extractor: Optional[BaseExtractor] = None,
    classifier: Optional[BaseClassifier] = None,
    aggregator: Optional[BaseAggregator] = None,
    transform: Optional[BaseTransform] = None,
    meta_labels: Dict[str, str] = dict(),
) -> None:
    """
    Reads a text-based PDF document.

    Parameters
    ----------
    extractor : BaseExtractor
        Text bloc extractor.
    classifier : BaseClassifier
        Classifier model, to assign a section (eg `body`, `header`, etc).
    aggregator : BaseAggregator
        Aggregator model, to compile labelled text blocs together.
    transform : BaseTransform, optional
        Transformation to apply before classification.
    meta_labels : Dict[str, str], optional
        Dictionary of hierarchical labels
        (eg `table` is probably within the `body`).
    """

    self.extractor = extractor
    self.classifier = classifier
    self.aggregator = aggregator

    self.transform = transform

    # Store a private copy: the default value is a single dict object
    # created at definition time, and must not be shared between readers.
    self.meta_labels = dict(meta_labels or {})

`predict(lines)`

Predict the label of each text bloc.

PARAMETER DESCRIPTION

lines

Text blocs to label.

TYPE: pd.DataFrame

RETURNS	DESCRIPTION
`pd.DataFrame`	Labelled text blocs.

Source code in edspdf/readers/reader.py

def predict(self, lines: pd.DataFrame) -> pd.DataFrame:
    """
    Label each text bloc with the classifier.

    Two columns are written on `lines` itself: `label`, the classifier
    output, and `meta_label`, the same labels after replacement through
    `self.meta_labels`.

    Parameters
    ----------
    lines : pd.DataFrame
        Text blocs to label.

    Returns
    -------
    pd.DataFrame
        The same DataFrame, with the two label columns.
    """

    predicted = self.classifier.predict(lines)
    lines["label"] = predicted

    # Labels without an entry in `meta_labels` are kept unchanged.
    lines["meta_label"] = lines["label"].replace(self.meta_labels)

    return lines

`prepare_data(pdf, **context)`

Prepare data before classification. Can also be used to generate the training dataset for the classifier.

PARAMETER DESCRIPTION

pdf

PDF document, as bytes.

TYPE: bytes

RETURNS	DESCRIPTION
`pd.DataFrame`	Text blocs as a pandas DataFrame.

Source code in edspdf/readers/reader.py

def prepare_data(self, pdf: bytes, **context: Any) -> pd.DataFrame:
    """
    Extract the lines of a PDF and get them ready for classification.
    Can also be used to generate the training dataset for the classifier.

    Parameters
    ----------
    pdf : bytes
        PDF document, as bytes.
    context : Any
        Extra values, each written as a column holding that value
        on every row.

    Returns
    -------
    pd.DataFrame
        Text blocs as a pandas DataFrame.
    """

    lines = self.extractor(pdf)

    # Broadcast every contextual value to a column of the same name.
    for column in context:
        lines[column] = context[column]

    # The transformation is optional.
    if self.transform is None:
        return lines

    return self.transform(lines)

`__call__(pdf, **context)`

Process the PDF document.

PARAMETER DESCRIPTION

pdf

Byte representation of the PDF document.

TYPE: bytes

context

Any contextual information that is used by the classifier (eg document type or source).

TYPE: Any

RETURNS	DESCRIPTION
`Dict[str, str]`	Dictionary containing the aggregated text.

Source code in edspdf/readers/reader.py

def __call__(
    self, pdf: bytes, **context: Any
) -> Union[Dict[str, str], Tuple[Dict[str, str], Dict[str, Any]]]:
    """
    Run the whole pipeline on a PDF document: preparation, prediction,
    then aggregation.

    Parameters
    ----------
    pdf : bytes
        Byte representation of the PDF document.

    context : Any
        Any contextual information that is used by the classifier
        (eg document type or source).

    Returns
    -------
    Dict[str, str]
        Dictionary containing the aggregated text (more generally,
        whatever the aggregator returns).
    """
    return self.aggregator(self.prepare_and_predict(pdf, **context))

`aggregators`

`styled`

`StyledAggregator`

Bases: SimpleAggregator

Aggregator that returns text and styles.

Source code in edspdf/aggregators/styled.py

@registry.aggregators.register("styled.v1")
class StyledAggregator(SimpleAggregator):
    """
    Aggregator that returns text and styles.
    """

    def aggregate(
        self, lines: pd.DataFrame
    ) -> Tuple[Dict[str, str], Dict[str, List[Dict]]]:
        """
        Aggregate the labelled lines into one text per label, along with the
        styles attached to each label.

        Parameters
        ----------
        lines : pd.DataFrame
            Labelled lines. The `page`, `y1`, `x0`, `label` and `styles`
            columns are read here.

        Returns
        -------
        Tuple[Dict[str, str], Dict[str, List[Dict]]]
            The text of each label, and for each label the list of its style
            records. Two empty dictionaries are returned for an empty input.
        """

        if len(lines) == 0:
            return {}, {}

        # Sorting returns a new frame: the caller's DataFrame is left untouched.
        lines = lines.sort_values(["page", "y1", "x0"])
        # Rename the labels through `label_map`; unknown labels are kept as-is.
        lines["label"] = lines["label"].map(lambda l: self.label_map.get(l, l))

        # Renumber the lines following the sorted order.
        lines["line_id"] = range(len(lines))

        # One row per style, each keeping the identifier of its line.
        styles = lines[["line_id", "styles"]].explode("styles").dropna().reset_index()
        styles = styles[["line_id"]].join(pd.json_normalize(styles.styles))

        # NOTE(review): `prepare_newlines` presumably adds the `text_with_newline`
        # column used below - confirm against its definition.
        lines = prepare_newlines(
            lines,
            nl_threshold=self.nl_threshold,
            np_threshold=self.np_threshold,
        )

        # Offset of a line = total length of the previous lines with the same
        # label (cumulative sum, shifted by one line; 0 for the first line).
        lines["offset"] = lines["text_with_newline"].str.len()
        lines["offset"] = lines.groupby(["label"])["offset"].transform("cumsum")
        lines["offset"] = lines.groupby(["label"])["offset"].transform("shift")
        lines["offset"] = lines["offset"].fillna(0).astype(int)

        # Shift the `start`/`end` of each style by the offset of its line.
        # NOTE(review): assumes every style record provides `start` and `end`.
        styles = styles.merge(lines[["line_id", "offset", "label"]], on="line_id")
        styles["start"] += styles.offset
        styles["end"] += styles.offset

        # Concatenate the text of the lines sharing a label.
        df = lines.groupby(["label"]).agg(text=("text_with_newline", "sum"))

        text = df.text.to_dict()
        # Style records of each label, without the bookkeeping columns.
        style = {
            label: styles.query("label == @label")
            .drop(columns=["line_id", "offset", "label"])
            .to_dict(orient="records")
            for label in text.keys()
        }

        return text, style

`base`

`BaseAggregator`

Bases: ABC

Source code in edspdf/aggregators/base.py

class BaseAggregator(ABC):
    """
    Abstract interface of the aggregators.

    Subclasses implement `aggregate`; calling an instance runs `aggregate`,
    optionally on a copy of the input.
    """

    @abstractmethod
    def aggregate(self, lines: pd.DataFrame) -> Dict[str, str]:
        """
        Aggregate the text of the lines. To be implemented by subclasses.
        """

    def __call__(self, lines: pd.DataFrame, copy: bool = False) -> Dict[str, str]:
        """
        Run `aggregate` on the lines.

        When `copy` is true the aggregation receives a copy of `lines`,
        otherwise it receives the DataFrame itself.
        """
        target = lines.copy() if copy else lines
        return self.aggregate(target)

`aggregate(lines)` `abstractmethod`

Handles the text aggregation

Source code in edspdf/aggregators/base.py

@abstractmethod
def aggregate(self, lines: pd.DataFrame) -> Dict[str, str]:
    """
    Aggregate the text of the lines.

    Abstract method: it has no implementation here and must be provided by
    the concrete aggregator.
    """

edspdf

loading

load(path)

from_str(config)

classifiers

align

align_labels(lines, labels, threshold=0.0001)

random

RandomClassifier

mask

MaskClassifier

dummy

DummyClassifier

base

BaseClassifier

predict(lines) abstractmethod

extractors

functional

get_blocs(layout)

Arguments

get_lines(layout)

remove_outside_lines(lines, strict_mode=False)

base

BaseExtractor

extract(pdf) abstractmethod

pdfminer

PdfMinerExtractor

generate_lines(pdf)

Arguments

extract(pdf)

style

models

BaseStyle

Style

from_fontname(fontname, size, upright, x0, x1, y0, y1) classmethod

__eq__(other)

StyledText

transforms

base

BaseTransform

transform(lines) abstractmethod

readers

reader

PdfReader

__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())

predict(lines)

prepare_data(pdf, **context)

__call__(pdf, **context)

aggregators

styled

StyledAggregator

base

BaseAggregator

aggregate(lines) abstractmethod

`edspdf`

`loading`

`load(path)`

`from_str(config)`

`classifiers`

`align`

`align_labels(lines, labels, threshold=0.0001)`

`random`

`RandomClassifier`

`mask`

`MaskClassifier`

`dummy`

`DummyClassifier`

`base`

`BaseClassifier`

`predict(lines)` `abstractmethod`

`extractors`

`functional`

`get_blocs(layout)`

`get_lines(layout)`

`remove_outside_lines(lines, strict_mode=False)`

`base`

`BaseExtractor`

`extract(pdf)` `abstractmethod`

`pdfminer`

`PdfMinerExtractor`

`generate_lines(pdf)`

`extract(pdf)`

`style`

`models`

`BaseStyle`

`Style`

`from_fontname(fontname, size, upright, x0, x1, y0, y1)` `classmethod`

`__eq__(other)`

`StyledText`

`transforms`

`base`

`BaseTransform`

`transform(lines)` `abstractmethod`

`readers`

`reader`

`PdfReader`

`__init__(extractor=None, classifier=None, aggregator=None, transform=None, meta_labels=dict())`

`predict(lines)`

`prepare_data(pdf, **context)`

`__call__(pdf, **context)`

`aggregators`

`styled`

`StyledAggregator`

`base`

`BaseAggregator`

`aggregate(lines)` `abstractmethod`