-
Notifications
You must be signed in to change notification settings - Fork 6
/
deid-output.config
139 lines (124 loc) · 5.23 KB
/
deid-output.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File: deid-output.config
# Example configuration file used by the de-identification
# software, deid.pl, to generate de-identified gold standard
# corpus. No performance statistics will be generated.
#
# Authors: Margaret Douglass, Ishna Neamatullah, William J. Long, Li-wei Lehman
# Last modified by Li-wei Lehman ([email protected]) Nov. 2007
#Description: This configuration file allows you to
# (1) set certain global variables (2) turn certain filters on/off,
# and (3) turn certain dictionaries on/off. The general format is
# <configuration string> = <value>
#
# The <value> is 'y'/'n' or '0'/'1' or some other values depending
# on the configuration string
# For example, for the configuration string "Gold standard comparison",
# set the value to either 0 or 1. See rest of this file for more examples.
# IMPORTANT: do not change the configuration string, as this might
# cause the software to not recognize the configuration setting.
########################################################################
#################Configure Comparison or Output Mode####################
# "Gold standard comparison = 0" for output mode.
# "Gold standard comparison = 1" for performance comparison mode; a
# gold standard corpus and a PHI list must be provided.
Gold standard comparison = 0
########################################################################
########Configure Date Related Variables for De-identification##########
# Date offset should be an integer that represents the number of days
# by which dates are shifted when re-identifying dates in the medical notes.
# This date offset will be applied to all patients. To use a different
# date shift for each patient, set "PID to date offset mapping"
# to 'y', and provide the mapping in a file called "shift.txt" in the
# same directory.
# Date offset is 0 in GS comparison mode, since we are not outputting
# any de-identified text with date shift. For output mode, set Date offset
# to the number of days by which dates will be shifted for all
# patients. Note that this offset is ignored if a PID to date offset file
# is available.
Date offset = 1000
# PID to date offset mapping: if set to 'y', the code will
# load patient-specific date shifts from the file "shift.txt".
PID to date offset mapping = n
# Format for the default date should be MM/DD/YYYY
Date Default = 01/01/2020
# The "Two Digit Year Threshold" is used to determine whether
# to interpret the year as a year in the 1900's or 2000's.
# Must be a 1- or 2-digit number.
# Two digit years > Threshold are interpreted as in the 1900's
# Two digit years <= Threshold are interpreted as in the 2000's
# The following threshold is set according to the re-identified dates
# that appear in our gold standard corpus.
Two Digit Year Threshold = 30
########################################################################
##################Configure De-identification Filters:##################
# De-identification filters used:
# PHI categories filtered:
# 1. Social Security Numbers (SSN)
# 2. Uniform Resource Locators (URL)
# 3. Email addresses
# 4. Telephone/fax numbers
# 5. Provider/unit/medical record numbers
# 6. Ages over 90
# 7. Locations and hospital names
# 8. Dates
# 9. Names
# 10. U.S. States
#Note:
# The GS (gold standard) filter filters patterns (e.g. ward names) specific
# to the gold standard corpus (which consists of nursing notes). The DS
# (discharge summary) filter should always be set to 'n' for this
# distribution, as it applies only to patterns we see in our discharge
# summaries, which are not included in this distribution.
# Use 'y' to set the filter on or 'n' to turn the filter off
SSN filter = y
URL filter = y
Email filter = y
Telephone filter = y
Unit number filter = y
Age filter = y
Location filter = y
Date filter = y
Name filter = y
State filter = n
GS filter = y
DS filter = n
#########################################################################
#########Configure Dictionary Loading for De-identification #############
# Note: there are more dictionaries than listed here. The ones listed
# here are those whose loading you can enable or disable.
# Generic first/last name dictionaries are always loaded.
# Lists used:
# 1. PID to patient name mappings: "lists/pid_patientname.txt";
# 2. Country names: "lists/countries_unambig.txt";
# 3. Company names:
# a) "lists/company_names_unambig.txt",
# b) "lists/company_names_ambig.txt".
# 4. Ethnicities: "lists/ethnicities_unambig.txt";
# 5. Hospitals: "lists/stripped_hospitals.txt";
# 6. Locations:
# a) "lists/locations_unambig.txt",
# b) "lists/locations_ambig.txt".
# 7. LocalPlaces:
# a) "lists/local_places_unambig.txt",
# b) "lists/local_places_ambig.txt".
# 8. Doctor names:
# a) "lists/doctor_first_names.txt"
# b) "lists/doctor_last_names.txt"
# 9. US States:
# a) "lists/us_states.txt",
# b) "lists/us_states_abbre.txt",
# c) "lists/more_us_state_abbreviations.txt".
#Configure lists/dictionaries:
# Use 'y' to load the dictionary or 'n' to not load the dictionary
# Note that we load the State dictionary in order to de-identify
# zip code patterns and university/college names that contain
# State names.
PID to patient name mapping = y
Country names = n
Company names = y
Ethnicities = n
Hospital names = y
Location names = y
Doctor names = y
LocalPlaces names = y
State names = y