Skip to content

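# edsnlp.pipelines.core.normalizer.pollution.patterns

Module-level regular-expression patterns. Each entry below shows the attribute name and its string value; the `pollution` dictionary at the end groups the `information`, `bars`, `biology`, `doctors`, `web`, `coding` and `footer` patterns under their own names.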

information = "(?s)(=====+\\s*)?(L\\s*e\\s*s\\sdonnées\\s*administratives,\\s*sociales\\s*|I?nfo\\s*rmation\\s*aux?\\s*patients?|L[’']AP-HP\\s*collecte\\s*vos\\s*données\\s*administratives|L[’']Assistance\\s*Publique\\s*-\\s*Hôpitaux\\s*de\\s*Paris\\s*\\(?AP-HP\\)?\\s*a\\s*créé\\s*une\\s*base\\s*de\\s*données).{,2000}https?:\\/\\/recherche\\.aphp\\.fr\\/eds\\/droit-opposition[\\s\\.]*" module-attribute

bars = '(?i)([nbw]|_|-|=){5,}' module-attribute

biology = '(\\b.*[|¦].*\\n)+' module-attribute

doctors = '(?mi)(^((dr)|(pr))(\\.|\\s|of).*)+' module-attribute

web = '(www\\.\\S*)|(\\S*@\\S*)' module-attribute

coding = '.*?[a-zA-Z]\\d{2,4}.*?(\\n|[a-zA-Z]\\d{2,4})' module-attribute

date = '\\b\\d\\d/\\d\\d/\\d\\d\\d\\d\\b' module-attribute

ipp = '80\\d{8}' module-attribute

page = '((^\\d\\/\\d\\s?)|(^\\d\\d?\\/\\d\\d\\?))' module-attribute

footer = '(?i)({page}.*\\n?pat.*(ipp)?.*\\n?(courrier valid.*)?)' module-attribute

pollution = dict(information=information, bars=bars, biology=biology, doctors=doctors, web=web, coding=coding, footer=footer) module-attribute