Skip to content

edsnlp.utils

colors

CATEGORY20 = ['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', '#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b', '#c49c94', '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22', '#dbdb8d', '#17becf', '#9edae5'] module-attribute

create_colors(labels)

Assign a colour for each label, using category20 palette. The method loops over the colour palette in case there are too many labels.

PARAMETER DESCRIPTION
labels

List of labels to colorise in displacy.

TYPE: List[str]

RETURNS DESCRIPTION
Dict[str, str]

A displacy-compatible colour assignment.

Source code in edsnlp/utils/colors.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def create_colors(labels: List[str]) -> Dict[str, str]:
    """
    Assign a colour to every label, drawing from the category20 palette.
    Colours are recycled when there are more labels than palette entries.

    Parameters
    ----------
    labels : List[str]
        List of labels to colorise in displacy.

    Returns
    -------
    Dict[str, str]
        A displacy-compatible colour assignment.
    """
    # `cycle` repeats the palette indefinitely, so `zip` stops on labels
    return dict(zip(labels, cycle(CATEGORY20)))

deprecation

deprecated_extension(name, new_name)

Source code in edsnlp/utils/deprecation.py
 9
10
11
12
13
14
15
16
def deprecated_extension(name: str, new_name: str) -> None:
    """
    Log a warning stating that the extension `name` is deprecated
    and that `new_name` should be used instead.
    """
    logger.warning(
        f'The extension "{name}" is deprecated and will be '
        "removed in a future version. "
        f'Please use "{new_name}" instead.'
    )

deprecated_getter_factory(name, new_name)

Source code in edsnlp/utils/deprecation.py
19
20
21
22
23
24
25
26
27
28
29
def deprecated_getter_factory(name: str, new_name: str) -> Callable:
    """
    Build an extension getter that forwards reads of the deprecated
    extension `name` to `new_name`, warning on every access.
    """

    def getter(toklike: Union[Token, Span, Doc]) -> Any:
        # Qualify the extension names with the object's type for the warning
        owner = type(toklike).__name__
        deprecated_extension(f"{owner}._.{name}", f"{owner}._.{new_name}")

        # Delegate to the non-deprecated extension
        return getattr(toklike._, new_name)

    return getter

deprecation(name, new_name=None)

Source code in edsnlp/utils/deprecation.py
32
33
34
35
36
37
38
39
40
41
42
def deprecation(name: str, new_name: Optional[str] = None):
    """
    Log a warning stating that calling `name` directly is deprecated.
    The advertised replacement defaults to ``eds.<name>``.
    """
    target = new_name or f"eds.{name}"

    logger.warning(
        f'Calling "{name}" directly is deprecated and '
        "will be removed in a future version. "
        f'Please use "{target}" instead.'
    )

deprecated_factory(name, new_name=None, default_config=None, func=None)

Execute the Language.factory method on a modified factory function. The modification adds a deprecation warning.

PARAMETER DESCRIPTION
name

The deprecated name for the pipeline

TYPE: str

new_name

The new name for the pipeline, which should be used, by default None

TYPE: Optional[str], optional DEFAULT: None

default_config

The configuration that should be passed to Language.factory, by default None

TYPE: Optional[Dict[str, Any]], optional DEFAULT: None

func

The function to decorate, by default None

TYPE: Optional[Callable], optional DEFAULT: None

RETURNS DESCRIPTION
Callable
Source code in edsnlp/utils/deprecation.py
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def deprecated_factory(
    name: str,
    new_name: Optional[str] = None,
    default_config: Optional[Dict[str, Any]] = None,
    func: Optional[Callable] = None,
) -> Callable:
    """
    Execute the Language.factory method on a modified factory function.
    The modification adds a deprecation warning.

    Parameters
    ----------
    name : str
        The deprecated name for the pipeline
    new_name : Optional[str], optional
        The new name for the pipeline, which should be used, by default None
    default_config : Optional[Dict[str, Any]], optional
        The configuration that should be passed to Language.factory, by default None
    func : Optional[Callable], optional
        The function to decorate, by default None

    Returns
    -------
    Callable
    """

    if default_config is None:
        default_config = dict()

    # Registration decorator for the deprecated name, obtained from spaCy
    wrapper = Language.factory(name, default_config=default_config)

    def wrap(factory):

        # Define decorator
        # We use micheles' decorator package to keep the same signature
        # See https://github.com/micheles/decorator/
        @decorator
        def decorate(
            f,
            *args,
            **kwargs,
        ):
            # Emit the deprecation warning, then delegate to the factory
            deprecation(name, new_name)
            return f(
                *args,
                **kwargs,
            )

        decorated = decorate(factory)

        # Register the warning-emitting variant under the deprecated name
        wrapper(decorated)

        # The original, undecorated factory is returned to the caller
        return factory

    if func is not None:
        # Direct usage: a factory function was supplied up front
        return wrap(func)

    # Decorator usage: return `wrap` to be applied to the factory later
    return wrap

examples

entity_pattern = re.compile('(<ent[^<>]*>[^<>]+</ent>)') module-attribute

text_pattern = re.compile('<ent.*>(.+)</ent>') module-attribute

modifiers_pattern = re.compile('<ent\\s?(.*)>.+</ent>') module-attribute

single_modifiers_pattern = regex.compile("(?P<key>[^\\s]+?)=((?P<value>[^\\s']+)|'(?P<value>.+)')") module-attribute

Match

Bases: BaseModel

Source code in edsnlp/utils/examples.py
 8
 9
10
11
12
class Match(BaseModel):
    """Raw representation of one `<ent ...>...</ent>` tag found in an example."""

    start_char: int  # start offset of the full tag in the raw example text
    end_char: int  # end offset of the full tag in the raw example text
    text: str  # inner text of the entity, without the surrounding tags
    modifiers: str  # raw modifier string extracted from the opening tag
start_char: int = None class-attribute
end_char: int = None class-attribute
text: str = None class-attribute
modifiers: str = None class-attribute

Modifier

Bases: BaseModel

Source code in edsnlp/utils/examples.py
15
16
17
class Modifier(BaseModel):
    """A single `key=value` modifier attached to a tagged entity."""

    key: str  # modifier name
    value: Union[int, float, bool, str]  # modifier value, coerced by pydantic
key: str = None class-attribute
value: Union[int, float, bool, str] = None class-attribute

Entity

Bases: BaseModel

Source code in edsnlp/utils/examples.py
20
21
22
23
class Entity(BaseModel):
    """An extracted entity, with offsets relative to the tag-free text."""

    start_char: int  # start offset in the cleaned (tag-free) text
    end_char: int  # end offset in the cleaned (tag-free) text
    modifiers: List[Modifier]  # parsed modifiers for this entity
start_char: int = None class-attribute
end_char: int = None class-attribute
modifiers: List[Modifier] = None class-attribute

find_matches(example)

Finds entities within the example.

PARAMETER DESCRIPTION
example

Example to process.

TYPE: str

RETURNS DESCRIPTION
List[re.Match]

List of matches for entities.

Source code in edsnlp/utils/examples.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def find_matches(example: str) -> List[re.Match]:
    """
    Find every tagged entity within the example.

    Parameters
    ----------
    example : str
        Example to process.

    Returns
    -------
    List[re.Match]
        List of matches for entities.
    """
    # Materialise the iterator of `<ent ...>` matches into a list
    return [*entity_pattern.finditer(example)]

parse_match(match)

Parse a regex match representing an entity.

PARAMETER DESCRIPTION
match

Match for an entity.

TYPE: re.Match

RETURNS DESCRIPTION
Match

Usable representation for the entity match.

Source code in edsnlp/utils/examples.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def parse_match(match: re.Match) -> Match:
    """
    Parse a regex match representing an entity.

    Parameters
    ----------
    match : re.Match
        Match for an entity.

    Returns
    -------
    Match
        Usable representation for the entity match.
    """
    # Full tagged fragment, e.g. `<ent key=value>text</ent>`
    fragment = match.group()

    return Match(
        start_char=match.start(),
        end_char=match.end(),
        # First capture of each pattern: inner text and modifier string
        text=text_pattern.findall(fragment)[0],
        modifiers=modifiers_pattern.findall(fragment)[0],
    )

parse_example(example)

Parses an example: finds entities and removes the tags.

PARAMETER DESCRIPTION
example

Example to process.

TYPE: str

RETURNS DESCRIPTION
Tuple[str, List[Entity]]

Cleaned text and extracted entities.

Source code in edsnlp/utils/examples.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def parse_example(example: str) -> Tuple[str, List[Entity]]:
    """
    Parse an example: extract the entities and strip the tags.

    Parameters
    ----------
    example : str
        Example to process.

    Returns
    -------
    Tuple[str, List[Entity]]
        Cleaned text and extracted entities.
    """

    clean = ""
    entities = []
    offset = 0

    for parsed in (parse_match(m) for m in find_matches(example=example)):

        # Copy the untagged text preceding the entity, then the bare
        # entity text itself; offsets are recorded in the cleaned text.
        clean += example[offset : parsed.start_char]
        begin = len(clean)
        clean += parsed.text

        entities.append(
            Entity(
                start_char=begin,
                end_char=len(clean),
                modifiers=[
                    Modifier.parse_obj(m.groupdict())
                    for m in single_modifiers_pattern.finditer(parsed.modifiers)
                ],
            )
        )

        # Resume copying after the closing tag
        offset = parsed.end_char

    # Append whatever follows the last tag
    clean += example[offset:]

    return clean, entities

filter

default_sort_key(span)

Returns the sort key for filtering spans.

PARAMETER DESCRIPTION
span

Span to sort.

TYPE: Span

RETURNS DESCRIPTION
key

Sort key.

TYPE: Tuple(int, int)

Source code in edsnlp/utils/filter.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def default_sort_key(span: Span) -> Tuple[int, int]:
    """
    Returns the default sort key for filtering spans:
    longer spans first, then leftmost spans first.

    Parameters
    ----------
    span : Span
        Span (or span/payload tuple) to sort.

    Returns
    -------
    key : Tuple(int, int)
        Sort key.
    """
    # Accept (span, payload) couples as produced by the RegexMatcher
    target = span[0] if isinstance(span, tuple) else span
    return target.end - target.start, -target.start

start_sort_key(span)

Returns the sort key for filtering spans by start order.

PARAMETER DESCRIPTION
span

Span to sort.

TYPE: Span

RETURNS DESCRIPTION
key

Sort key.

TYPE: Tuple(int, int)

Source code in edsnlp/utils/filter.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def start_sort_key(span: Union[Span, Tuple[Span, Any]]) -> int:
    """
    Returns the sort key for filtering spans by start order.

    Parameters
    ----------
    span : Union[Span, Tuple[Span, Any]]
        Span (or span/payload tuple) to sort.

    Returns
    -------
    int
        Sort key: the span's start index.
        (The annotation previously claimed ``Tuple[int, int]``, but the
        function returns a single integer.)
    """
    # Accept (span, payload) couples as produced by the RegexMatcher
    if isinstance(span, tuple):
        span = span[0]
    return span.start

filter_spans(spans, label_to_remove=None, return_discarded=False, sort_key=default_sort_key)

Re-definition of spacy's filtering function, that returns discarded spans as well as filtered ones.

Can also accept a label_to_remove argument, useful for filtering out pseudo cues. If set, results can contain overlapping spans: only spans overlapping with excluded labels are removed. The main expected use case is for pseudo-cues.

It can handle an iterable of tuples instead of an iterable of Spans. The primary use-case is the use with the RegexMatcher's capacity to return the span's groupdict.

The spaCy documentation states:

Filter a sequence of spans and remove duplicates or overlaps. Useful for creating named entities (where one token can only be part of one entity) or when merging spans with Retokenizer.merge. When spans overlap, the (first) longest span is preferred over shorter spans.

Filtering out spans

If the label_to_remove argument is supplied, it might be tempting to filter overlapping spans that are not part of a label to remove.

The reason we keep all other possibly overlapping labels is that in qualifier pipelines, the same cue can precede and follow a marked entity. Hence we need to keep every example.

PARAMETER DESCRIPTION
spans

Spans to filter.

TYPE: Iterable[Union["Span", Tuple["Span", Any]]]

return_discarded

Whether to return discarded spans.

TYPE: bool DEFAULT: False

label_to_remove

Label to remove. If set, results can contain overlapping spans.

TYPE: str, optional DEFAULT: None

sort_key

Key to sorting spans before applying overlap conflict resolution. A span with a higher key will have precedence over another span. By default, the largest, leftmost spans are selected first.

TYPE: Callable[Span, Any], optional DEFAULT: default_sort_key

RETURNS DESCRIPTION
results

Filtered spans

TYPE: List[Union[Span, Tuple[Span, Any]]]

discarded

Discarded spans

TYPE: List[Union[Span, Tuple[Span, Any]]], optional

Source code in edsnlp/utils/filter.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def filter_spans(
    spans: Iterable[Union["Span", Tuple["Span", Any]]],
    label_to_remove: Optional[str] = None,
    return_discarded: bool = False,
    sort_key: Callable[[Span], Any] = default_sort_key,
) -> Union[
    List[Union[Span, Tuple[Span, Any]]],
    Tuple[List[Union[Span, Tuple[Span, Any]]], List[Union[Span, Tuple[Span, Any]]]],
]:
    """
    Re-implementation of spaCy's span filtering that can also return the
    spans it discards.

    A `label_to_remove` argument may be supplied, which is useful for
    filtering out pseudo cues: in that case `results` can contain
    overlapping spans, since only spans overlapping an excluded label
    are dropped.

    Tuples `(span, payload)` are accepted in place of bare `Span`s — the
    primary use-case being the `RegexMatcher`'s ability to return each
    span's `groupdict`.

    !!! note ""

        The **spaCy documentation states**:

        > Filter a sequence of spans and remove duplicates or overlaps.
        > Useful for creating named entities (where one token can only
        > be part of one entity) or when merging spans with
        > `Retokenizer.merge`. When spans overlap, the (first)
        > longest span is preferred over shorter spans.

    !!! danger "Filtering out spans"

        If the `label_to_remove` argument is supplied, it might be tempting to
        filter overlapping spans that are not part of a label to remove.

        The reason we keep all other possibly overlapping labels is that in qualifier
        pipelines, the same cue can precede **and** follow a marked entity.
        Hence we need to keep every example.

    Parameters
    ----------
    spans : Iterable[Union["Span", Tuple["Span", Any]]]
        Spans to filter.
    return_discarded : bool
        Whether to return discarded spans.
    label_to_remove : str, optional
        Label to remove. If set, results can contain overlapping spans.
    sort_key : Callable[Span, Any], optional
        Key to sorting spans before applying overlap conflict resolution.
        A span with a higher key will have precedence over another span.
        By default, the largest, leftmost spans are selected first.

    Returns
    -------
    results : List[Union[Span, Tuple[Span, Any]]]
        Filtered spans
    discarded : List[Union[Span, Tuple[Span, Any]]], optional
        Discarded spans
    """
    kept = []
    dropped = []
    claimed = set()  # token indices already covered by a selected span

    # Highest-priority spans are examined first
    for item in sorted(spans, key=sort_key, reverse=True):
        span = item if isinstance(item, Span) else item[0]

        # Check end - 1 because token boundaries are inclusive here
        if span.start not in claimed and span.end - 1 not in claimed:
            if label_to_remove is None or span.label_ != label_to_remove:
                kept.append(item)
            # Without `label_to_remove`, every kept span claims its tokens;
            # with it, only spans of the removed label block others.
            if label_to_remove is None or span.label_ == label_to_remove:
                claimed.update(range(span.start, span.end))
        elif label_to_remove is None or span.label_ != label_to_remove:
            dropped.append(item)

    # Restore document order
    kept.sort(key=start_sort_key)
    dropped.sort(key=start_sort_key)

    if return_discarded:
        return kept, dropped

    return kept

consume_spans(spans, filter, second_chance=None)

Consume a list of spans, according to a filter.

Warning

This method makes the hard hypothesis that:

  1. Spans are sorted.
  2. Spans are consumed in sequence and only once.

The second item is problematic for the way we treat long entities, hence the second_chance parameter, which lets entities be seen more than once.

PARAMETER DESCRIPTION
spans

List of spans to filter

TYPE: List of spans

filter

Filtering function. Should return True when the item is to be included.

TYPE: Callable

second_chance

Optional list of spans to include again (useful for long entities), by default None

TYPE: List of spans, optional DEFAULT: None

RETURNS DESCRIPTION
matches

List of spans consumed by the filter.

TYPE: List of spans

remainder

List of remaining spans in the original spans parameter.

TYPE: List of spans

Source code in edsnlp/utils/filter.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def consume_spans(
    spans: List[Span],
    filter: Callable,
    second_chance: Optional[List[Span]] = None,
) -> Tuple[List[Span], List[Span]]:
    """
    Consume a list of spans, according to a filter.

    !!! warning
        This method makes the hard hypothesis that:

        1. Spans are sorted.
        2. Spans are consumed in sequence and only once.

        The second item is problematic for the way we treat long entities,
        hence the `second_chance` parameter, which lets entities be seen
        more than once.

    Parameters
    ----------
    spans : List of spans
        List of spans to filter
    filter : Callable
        Filtering function. Should return True when the item is to be included.
    second_chance : List of spans, optional
        Optional list of spans to include again (useful for long entities),
        by default None

    Returns
    -------
    matches : List of spans
        List of spans consumed by the filter.
    remainder : List of spans
        List of remaining spans in the original `spans` parameter.
    """

    # Previously-seen spans that the filter accepts again
    extras = [span for span in second_chance if filter(span)] if second_chance else []

    if not spans:
        return extras, []

    # Locate the first span rejected by the filter: everything before it
    # is consumed, everything from it onwards remains.
    cut = len(spans)
    for index, candidate in enumerate(spans):
        if not filter(candidate):
            cut = index
            break

    consumed = spans[:cut]
    consumed.extend(extras)

    return consumed, spans[cut:]

get_spans(spans, label)

Extracts spans with a given label. Prefer using hash label for performance reasons.

PARAMETER DESCRIPTION
spans

List of spans to filter.

TYPE: List[Span]

label

Label to filter on.

TYPE: Union[int, str]

RETURNS DESCRIPTION
List[Span]

Filtered spans.

Source code in edsnlp/utils/filter.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
def get_spans(spans: List[Span], label: Union[int, str]) -> List[Span]:
    """
    Extract spans bearing a given label.
    Prefer using the hashed (integer) label for performance reasons.

    Parameters
    ----------
    spans : List[Span]
        List of spans to filter.
    label : Union[int, str]
        Label to filter on.

    Returns
    -------
    List[Span]
        Filtered spans.
    """
    # Integer labels compare against the hash, strings against the text
    attribute = "label" if isinstance(label, int) else "label_"
    return [span for span in spans if getattr(span, attribute) == label]

resources

get_verbs(verbs=None, check_contains=True)

Extract verbs from the resources, as a pandas dataframe.

PARAMETER DESCRIPTION
verbs

List of verbs to keep. Returns all verbs by default.

TYPE: List[str], optional DEFAULT: None

check_contains

Whether to check that no verb is missing if a list of verbs was provided. By default True

TYPE: bool, optional DEFAULT: True

RETURNS DESCRIPTION
pd.DataFrame

DataFrame containing conjugated verbs.

Source code in edsnlp/utils/resources.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def get_verbs(
    verbs: Optional[List[str]] = None, check_contains: bool = True
) -> pd.DataFrame:
    """
    Extract conjugated verbs from the resources, as a pandas dataframe.

    Parameters
    ----------
    verbs : List[str], optional
        List of verbs to keep. Returns all verbs by default.
    check_contains : bool, optional
        Whether to check that no verb is missing if a list of verbs was provided.
        By default True

    Returns
    -------
    pd.DataFrame
        DataFrame containing conjugated verbs.
    """

    conjugated = pd.read_csv(BASE_DIR / "resources" / "verbs.csv")

    # No selection requested: hand back the full table
    if not verbs:
        return conjugated

    wanted = set(verbs)
    selection = conjugated[conjugated.verb.isin(wanted)]

    if check_contains:
        # Every requested verb must appear in the resource file
        assert len(wanted) == selection.verb.nunique(), "Some verbs are missing !"

    return selection

regex

make_pattern(patterns, with_breaks=False, name=None)

Create OR pattern from a list of patterns.

PARAMETER DESCRIPTION
patterns

List of patterns to merge.

TYPE: List[str]

with_breaks

Whether to add breaks (\b) on each side, by default False

TYPE: bool, optional DEFAULT: False

name

Name of the group, using regex ?P<> directive.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
str

Merged pattern.

Source code in edsnlp/utils/regex.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def make_pattern(
    patterns: List[str],
    with_breaks: bool = False,
    name: Optional[str] = None,
) -> str:
    r"""
    Create OR pattern from a list of patterns.

    Parameters
    ----------
    patterns : List[str]
        List of patterns to merge. The list is no longer mutated.
    with_breaks : bool, optional
        Whether to add breaks (`\b`) on each side, by default False
    name: str, optional
        Name of the group, using regex `?P<>` directive.

    Returns
    -------
    str
        Merged pattern.
    """

    prefix = f"(?P<{name}>" if name else "("

    # Longest-first alternation prevents shorter alternatives from
    # shadowing longer ones. `sorted` keeps the caller's list untouched
    # (the previous in-place `patterns.sort(...)` mutated the argument
    # as a side effect).
    pattern = prefix + "|".join(sorted(patterns, key=len, reverse=True)) + ")"

    if with_breaks:
        pattern = r"\b" + pattern + r"\b"

    return pattern

compile_regex(reg)

This function tries to compile reg using the re module, and falls back to the regex module, which is more permissive.

PARAMETER DESCRIPTION
reg

RETURNS DESCRIPTION
Union[re.Pattern, regex.Pattern]
Source code in edsnlp/utils/regex.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def compile_regex(reg):
    """
    Compile `reg` with the stricter `re` module when possible, falling
    back to the more permissive `regex` module otherwise.

    Parameters
    ----------
    reg: str

    Returns
    -------
    Union[re.Pattern, regex.Pattern]
    """
    # First attempt: the standard-library engine
    try:
        return re.compile(reg)
    except re.error:
        pass

    # Fallback: the third-party `regex` engine
    try:
        return regex.compile(reg)
    except regex.error:
        raise Exception("Could not compile: {}".format(repr(reg)))

inclusion

check_inclusion(span, start, end)

Checks whether the span overlaps the boundaries.

PARAMETER DESCRIPTION
span

Span to check.

TYPE: Span

start

Start of the boundary

TYPE: int

end

End of the boundary

TYPE: int

RETURNS DESCRIPTION
bool

Whether the span overlaps the boundaries.

Source code in edsnlp/utils/inclusion.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def check_inclusion(span: Span, start: int, end: int) -> bool:
    """
    Checks whether the span overlaps the boundaries.

    Parameters
    ----------
    span : Span
        Span to check.
    start : int
        Start of the boundary
    end : int
        End of the boundary

    Returns
    -------
    bool
        Whether the span overlaps the boundaries.
    """
    # Overlap requires the span to begin before the boundary ends and to
    # end after the boundary begins.
    return span.start < end and span.end > start

blocs

Utility that extracts code blocs and runs them.

Largely inspired by https://github.com/koaning/mktestdocs

BLOCK_PATTERN = re.compile('((?P<skip><!-- no-check -->)\\s+)?(?P<indent> *)```(?P<title>.*?)\\n(?P<code>.+?)```', flags=re.DOTALL) module-attribute

OUTPUT_PATTERN = '# Out: ' module-attribute

check_outputs(code)

Looks for output patterns, and modifies the bloc:

  1. The preceding line becomes v = expr
  2. The output line becomes an assert statement
PARAMETER DESCRIPTION
code

Code block

TYPE: str

RETURNS DESCRIPTION
str

Modified code bloc with assert statements

Source code in edsnlp/utils/blocs.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def check_outputs(code: str) -> str:
    """
    Looks for output patterns, and modifies the bloc:

    1. The preceding line becomes `#!python v = expr`
    2. The output line becomes an `#!python assert` statement

    Parameters
    ----------
    code : str
        Code block

    Returns
    -------
    str
        Modified code bloc with assert statements
    """

    lines: List[str] = code.split("\n")

    # Nothing to pair up: return the bloc unchanged.
    # (Previously returned the empty list `[]` instead of the original
    # string, violating the declared `str` return type.)
    if len(lines) < 2:
        return code

    out: List[str] = []
    skip = False

    for expression, output in zip(lines[:-1], lines[1:]):
        if skip:
            # This line was already consumed as an output on the last turn
            skip = False
            continue

        if output.startswith(OUTPUT_PATTERN):
            # Capture the expression's value, then assert on its repr/str
            expression = f"v = {expression}"

            output = output[len(OUTPUT_PATTERN) :].replace('"', r"\"")
            output = f'assert repr(v) == "{output}" or str(v) == "{output}"'

            out.append(expression)
            out.append(output)

            skip = True

        else:
            out.append(expression)

    # The last line was not consumed as an output: keep it
    if not skip:
        out.append(output)

    return "\n".join(out)

remove_indentation(code, indent)

Remove indentation from a code bloc.

PARAMETER DESCRIPTION
code

Code bloc

TYPE: str

indent

Level of indentation

TYPE: int

RETURNS DESCRIPTION
str

Modified code bloc

Source code in edsnlp/utils/blocs.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def remove_indentation(code: str, indent: int) -> str:
    """
    Remove a fixed level of indentation from a code bloc.

    Parameters
    ----------
    code : str
        Code bloc
    indent : int
        Level of indentation

    Returns
    -------
    str
        Modified code bloc
    """

    if not indent:
        return code

    # Drop the first `indent` characters of every line
    return "\n".join(line[indent:] for line in code.split("\n"))

grab_code_blocks(docstring, lang='python')

Given a docstring, grab all the markdown codeblocks found in docstring.

PARAMETER DESCRIPTION
docstring

Full text.

TYPE: str

lang

Language to execute, by default "python"

TYPE: str, optional DEFAULT: 'python'

RETURNS DESCRIPTION
List[str]

Extracted code blocks

Source code in edsnlp/utils/blocs.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def grab_code_blocks(docstring: str, lang="python") -> List[str]:
    """
    Given a docstring, grab all the markdown codeblocks found in docstring.

    Parameters
    ----------
    docstring : str
        Full text.
    lang : str, optional
        Language to execute, by default "python"

    Returns
    -------
    List[str]
        Extracted code blocks
    """
    blocks = []

    for match in BLOCK_PATTERN.finditer(docstring):
        groups = match.groupdict()

        # Blocs preceded by `<!-- no-check -->` are excluded
        if groups["skip"]:
            continue

        # Keep only blocs whose fence title mentions the target language
        if lang in groups["title"]:
            code = remove_indentation(groups["code"], len(groups["indent"]))
            blocks.append(check_outputs(code))

    return blocks

printer(code)

Prints a code bloc with lines for easier debugging.

PARAMETER DESCRIPTION
code

Code bloc.

TYPE: str

Source code in edsnlp/utils/blocs.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def printer(code: str) -> None:
    """
    Prints a code bloc with zero-padded line numbers for easier debugging.

    Parameters
    ----------
    code : str
        Code bloc.
    """
    numbered = (
        f"{number:03}  {line}"
        for number, line in enumerate(code.split("\n"), start=1)
    )
    print("\n".join(numbered))

check_docstring(obj, lang='')

Given a function, test the contents of the docstring.

Source code in edsnlp/utils/blocs.py
148
149
150
151
152
153
154
155
156
157
158
def check_docstring(obj, lang=""):
    """
    Given a function, test the contents of the docstring.
    """
    for bloc in grab_code_blocks(obj.__doc__, lang=lang):
        try:
            # Each bloc is executed in its own fresh namespace
            exec(bloc, {"__MODULE__": "__main__"})
        except Exception:
            # Show the failing bloc with line numbers before re-raising
            print(f"Error Encountered in `{obj.__name__}`. Caused by:\n")
            printer(bloc)
            raise

check_raw_string(raw, lang='python')

Given a raw string, test the contents.

Source code in edsnlp/utils/blocs.py
161
162
163
164
165
166
167
168
169
170
def check_raw_string(raw, lang="python"):
    """
    Given a raw string, test the contents.
    """
    for bloc in grab_code_blocks(raw, lang=lang):
        try:
            # Each bloc is executed in its own fresh namespace
            exec(bloc, {"__MODULE__": "__main__"})
        except Exception:
            # Show the failing bloc with line numbers before re-raising
            printer(bloc)
            raise

check_raw_file_full(raw, lang='python')

Source code in edsnlp/utils/blocs.py
173
174
175
176
177
178
179
def check_raw_file_full(raw, lang="python"):
    """
    Concatenate every code bloc found in `raw` and execute them as a
    single program, so state carries over from one bloc to the next.
    """
    program = "\n".join(grab_code_blocks(raw, lang=lang))
    try:
        exec(program, {"__MODULE__": "__main__"})
    except Exception:
        # Show the concatenated program with line numbers before re-raising
        printer(program)
        raise

check_md_file(path, memory=False)

Given a markdown file, parse the contents for Python code blocs and check that each independent bloc does not cause an error.

PARAMETER DESCRIPTION
path

Path to the markdown file to execute.

TYPE: Path

memory

Whether to keep results from one bloc to the next, by default False

TYPE: bool, optional DEFAULT: False

Source code in edsnlp/utils/blocs.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def check_md_file(path: Path, memory: bool = False) -> None:
    """
    Given a markdown file, parse the contents for Python code blocs
    and check that each independent bloc does not cause an error.

    Parameters
    ----------
    path : Path
        Path to the markdown file to execute.
    memory : bool, optional
        Whether to keep results from one bloc to the next, by default `#!python False`
    """
    contents = Path(path).read_text()

    # With `memory`, blocs share a namespace; otherwise each runs alone
    checker = check_raw_file_full if memory else check_raw_string
    checker(contents, lang="python")
Back to top