Skip to content

Tobacco consumption

The eds.tobacco pipeline component extracts mentions of tobacco consumption.

Details of the patterns used

# fmt: off
# Pack-year unit: "pa" or "p/a" as a standalone word, or "paquet(s) annee"
# with at most one arbitrary character between the two words.
PA = r"(?:\bp/?a\b|paquets?.?annee)"
# A 1- to 3-digit number, exposed through the named group "quantity".
QUANTITY = r"(?P<quantity>[\d]{1,3})"
# Punctuation that must not appear between a quantity and its unit.
# This string is spliced into a character class, so the hyphen has to be
# escaped: a bare "-" between "," and ";" is read as the range ","-";",
# which silently also covers "/", ":" and every digit.
PUNCT = r"\.,\-;\(\)"

default_patterns = dict(
    source="tobacco",
    # Trigger patterns; they are matched on the attribute named by
    # `regex_attr` below.
    regex=[
        r"tabagi",
        r"tabac",
        r"\bfume\b",
        r"\bfumeu",
        r"\bpipes?\b",
    ],
    # A trigger is discarded when one of these patterns is found in the
    # exclusion window around it.
    exclude=dict(
        regex=[
            "occasion",
            "moder",
            "quelqu",
            "festi",
            "rare",
            "sujet",  # Example : Chez le sujet fumeur ... generic sentences
        ],
        window=(-3, 5),
    ),
    regex_attr="NORM",
    assign=[
        dict(
            # Cessation cue, unless directly preceded by "non " or "pas ".
            name="stopped",
            regex=r"(?<!non )(?<!pas )(\bex\b|sevr|arret|stop|ancien)",
            window=(-3, 15),
        ),
        dict(
            # "0" or "non" (but not "non sevr...") at the start of the window,
            # optionally after a word, a colon and spaces/dashes.
            name="zero_after",
            regex=r"^[a-z]*\s*:?[\s-]*(0|non(?! sevr))",
            window=6,
        ),
        dict(
            # Quantity then unit, or unit then quantity, separated by at most
            # 10 characters. Digits are kept out of the gap on purpose so that
            # the number closest to the unit is the one captured.
            name="PA",
            regex=rf"{QUANTITY}[^{PUNCT}\d]{{0,10}}{PA}|{PA}[^{PUNCT}\d]{{0,10}}{QUANTITY}",
            window=(-10, 10),
            reduce_mode="keep_first",
        ),
        dict(
            name="secondhand",
            regex="(passif)",
            window=5,
            reduce_mode="keep_first",
        ),
    ],
)
# fmt: on

Extensions

On each matched span `span`, the following attributes are available:

  • span._.detailled_status: set to either
    • "PRESENT"
    • "ABSTINENCE" if the patient has stopped consuming tobacco
    • "ABSENT" if the patient has no tobacco dependence
  • span._.assigned: dictionary with the following keys, if relevant:
    • PA: the mentioned number of pack-years (= paquet-année)
    • secondhand: if secondhand smoking

Usage

import spacy

# Start from an empty pipeline for the "eds" language.
nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
# The tobacco patterns are matched on the NORM attribute (regex_attr="NORM"),
# so the normalizer is added before the matcher.
# NOTE(review): presumably this component is what populates NORM - confirm
# against the normalizer documentation.
nlp.add_pipe(
    "eds.normalizer",
    config=dict(
        accents=True,
        lowercase=True,
        quotes=True,
        spaces=True,
        pollution=dict(
            information=True,
            bars=True,
            biology=True,
            doctors=True,
            web=True,
            coding=True,
            footer=True,
        ),
    ),
)
# Plain string literal: there is nothing to interpolate in the component name.
nlp.add_pipe("eds.tobacco")

Below are a few examples:

# Example: a pack-year count next to the mention is exposed under the "PA" key.
text = "Tabagisme évalué à 15 PA"
doc = nlp(text)
spans = doc.spans["tobacco"]

spans
# Out: [Tabagisme évalué à 15 PA]

span = spans[0]

span._.assigned
# Out: {'PA': 15}
# Example: a plain mention, with no modifier around it, yields a single span.
text = "Patient tabagique"
doc = nlp(text)
spans = doc.spans["tobacco"]

spans
# Out: [tabagique]
# Example: "festif" hits the "festi" exclusion pattern, so no span is returned.
text = "Tabagisme festif"
doc = nlp(text)
spans = doc.spans["tobacco"]

spans
# Out: []
# Example: "ancien" is one of the "stopped" cues; the status is ABSTINENCE.
text = "On a un tabagisme ancien"
doc = nlp(text)
spans = doc.spans["tobacco"]

spans
# Out: [tabagisme ancien]

span = spans[0]

span._.detailled_status
# Out: ABSTINENCE

span._.assigned
# Out: {'stopped': [ancien]}
# Example: a "0" right after the mention is captured as "zero_after"; the
# status is ABSENT.
text = "Tabac: 0"
doc = nlp(text)
spans = doc.spans["tobacco"]

spans
# Out: [Tabac: 0]

span = spans[0]

span._.detailled_status
# Out: ABSENT

span._.assigned
# Out: {'zero_after': [0]}
# Example: secondhand smoking ("passif") is reported under "secondhand"; the
# status is ABSENT.
text = "Tabagisme passif"
doc = nlp(text)
spans = doc.spans["tobacco"]

spans
# Out: [Tabagisme passif]

span = spans[0]

span._.detailled_status
# Out: ABSENT

span._.assigned
# Out: {'secondhand': passif}
# Example: "sevré" is one of the "stopped" cues; the status is ABSTINENCE.
text = "Tabac: sevré depuis 5 ans"
doc = nlp(text)
spans = doc.spans["tobacco"]

spans
# Out: [Tabac: sevré]

span = spans[0]

span._.detailled_status
# Out: ABSTINENCE

span._.assigned
# Out: {'stopped': [sevré]}

Authors and citation

The eds.tobacco component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of these components is being drafted and will soon be available.