source: ccr2ccd-xslt/trunk/make/ProcFix.py@ 1806

Last change on this file since 1806 was 1004, checked in by George Lilly, 14 years ago

final PQRI files

File size: 13.0 KB
Line 
1################################################################################
2#
3# Process the PROCEDURES section of an intermediate CCR Level 3 XML file using
4# a set of rules.
5#
6# Usage: ProcFix codelistcsvfile xmlfile
7#
8# Where:
9# codelistcsvfile: defines valid medical vocabulary codes for PROCEDURES
10# xmlfile: file to process; it is read in full and then overwritten in
11# place with the filtered result
12#
13# The process of producing quality measures XML for submission to CMS (as part
14# of Meaningful Use certification) is quite complex. Briefly, the steps are:
15#
16# 1. The VistA EHR outputs a CCR record (XML).
17#
18# 2. The CCR is transformed to CCD Level 3 format (XML) using XSLT.
19#
20# 3. Due to data vagaries (desire to limit patient-identifying information and
21# new Schematron validation directives introduced for 2011), additional
22# processing is necessary to output a final XML file for submission to CMS.
23#
24# A series of sed, awk, and other (such as this program) commands are
25# applied to slowly move the baseline CCD into a final format that will
26# successfully validate with CMS. Some of the intermediate "XML" files
27# produced during this multi-step transformation may not be well-formed,
28# which makes it difficult to use tools such as XSLT.
29#
30# This program only assumes that the XML for the PROCEDURES section will be
31# well-formed. Other sections of the input file may not be valid XML, but that
32# will not adversely affect program operation.
33#
34################################################################################
35
36import sys
37import csv
38
39################################################################################
40# This function extracts the code value and OID from the supplied line and
41# builds a code dictionary key of the form "OID:value"
42################################################################################
def build_code_key(line):
    """Return the code dictionary key ("OID:value") for a <code .../> line.

    The key is the value of the codeSystem attribute, a colon, and the value
    of the code attribute, in that order.

    All whitespace is removed from the line before searching, because XML
    allows arbitrary whitespace (blanks, tabs) around the '=' of an
    attribute.  As a consequence any whitespace inside an attribute value is
    removed as well.

    An attribute that is absent, or whose closing quote is missing,
    contributes an empty string instead of a slice of unrelated text, so a
    malformed line yields a key such as ":99242" that cannot accidentally
    match a dictionary entry.
    """
    squeezed = ''.join(line.split())

    def attribute_value(name):
        # Locate name="..." in the whitespace-free line and return the text
        # between the quotes; '' when the attribute is not present.
        marker = name + '="'
        start = squeezed.find(marker)
        if start < 0:
            return ''
        start += len(marker)
        end = squeezed.find('"', start)
        if end < 0:
            return ''
        return squeezed[start:end]

    # 'code="' cannot match inside 'codeSystem="' or 'codeSystemName="',
    # so the two lookups are independent of attribute order.
    return attribute_value('codeSystem') + ':' + attribute_value('code')
56
57################################################################################
58# Read the CSV file, which is exported from Appendix G of the 2011 Downloadable
59# Resources table. This file defines the medical vocabulary codes that may be
60# used by the PROCEDURES section. File format:
61#
62# - Column 1 is the symbolic code set name
63# - Column 2 is the HL7 OID (object identifier number) for the code set
64# - Column 3 is the actual code value
65#
66# The symbolic code set name is defined by Appendix G using terminology such as
67# C4 to mean CPT-4, SNM to mean SNOMED-CT, LN to mean LOINC, and so on. The
68# VistA CCD output uses different symbolic names (for example, CPT-4 instead of
69# C4). The CMS validation rules do not validate symbolic code set names.
70#
71# Since there is no code set naming contract between VistA CCD output and this
72# program, it would be unwise to use observed values - so we will not check the
73# symbolic code name attributes.
74#
75# Per CMS 2011 rules, we must check the code set OID and the associated code
76# value. A dictionary is an easy way to manage the OID/value information.
77# The key will be OID + ':' + CodeValue, and the value will be CodeValue.
78################################################################################
def create_code_dictionary(csvfilename):
    """Load the valid PROCEDURES codes from a CSV file into a dictionary.

    Each usable row contributes one entry whose key is
    column 2 + ':' + column 3 (OID and code value) and whose value is
    column 3.  Column 1 (the symbolic code set name) is ignored.

    Rows with fewer than three columns are skipped: csv.reader returns an
    empty list for a blank line, so without this check a trailing blank line
    in the export would abort the whole run with an IndexError.

    Any error raised while opening or reading the file propagates to the
    caller; the file is always closed.
    """
    codedict = {}
    with open(csvfilename, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 3:
                continue
            codedict[row[1] + ":" + row[2]] = row[2]

    return codedict
91
92################################################################################
93# Do a pattern match on the supplied line using the supplied patterns. Both
94# the line and patterns are processed (local copies) to remove all blanks. This
95# is necessary because XML attributes inside an element have arbitrary spacing
96# rules. For example,
97#
98# <element attr = "value"...
99# <element attr = "value"...
100# <element attr="value"
101#
102# Are all valid XML. Removing all whitespace makes sane pattern matching
103# possible.
104################################################################################
def match(line, pat1, pat2="", pat3="", pat4=""):
    """Return 1 if every supplied pattern occurs in the line, else 0.

    The comparison is done on copies of the line and of each pattern from
    which all whitespace has been removed, so the spacing used around XML
    attribute names, '=' signs and values does not matter.  Tabs and line
    ends are removed as well as blanks, since XML allows any of them between
    the parts of an attribute.

    pat2..pat4 are optional; an empty pattern (the default) always matches.
    Patterns are tested independently, so their order within the line is
    not significant.
    """
    squeezed_line = ''.join(line.split())

    for pattern in (pat1, pat2, pat3, pat4):
        # An empty (or whitespace-only) pattern squeezes to '' and is
        # trivially contained in any line.
        if ''.join(pattern.split()) not in squeezed_line:
            return 0

    return 1
120
121################################################################################
122# Find the start and end of the PROCEDURES section. According to the CMS 2011
123# specs, the structure for PROCEDURES is:
124#
125# <component>
126# <section>
127# <templateId root="2.16.840.1.113883.10.20.1.12"/>
128# <code code="47519-4" codeSystem="2.16.840.1.113883.6.1"/>
129# ...various PROCEDURES elements
130# </section>
131# </component>
132#
133# Note that there can be multiple templateId elements. However, the one given
134# above (2.16.840.1.113883.10.20.1.12) is mandatory and serves (in conjunction
135# with the code element) to identify the PROCEDURES section.
136#
137################################################################################
def find_procedures_section(source):
    """Locate the <component> element that wraps the PROCEDURES section.

    A candidate <component> ... </component> span is accepted when it
    contains, in this order, a <section> start, the mandatory templateId
    (root 2.16.840.1.113883.10.20.1.12) and the PROCEDURES <code> element
    (code 47519-4, codeSystem 2.16.840.1.113883.6.1) before the section end.

    Arguments:
      source: the list of file lines; only its length is used here.

    Returns (component_start, component_end) line indexes on success, or
    (-1, -1) when no PROCEDURES section is found.

    NOTE(review): find_line() is not defined in the visible part of this
    file and is called without `source`.  The code below presumes it takes
    (first_index, last_index, patterns) and returns the index of the first
    matching line, or a negative number when nothing matches - confirm
    before using this function.
    """
    start_index = 0
    end_index = len(source) - 1
    min_procedures_len = 6

    # Give up once fewer lines remain than the smallest possible section.
    while start_index + min_procedures_len < end_index:
        # No <component> element means no PROCEDURES. Quit searching.
        component_start = find_line(start_index, end_index, ["<component>"])
        if component_start < 0:
            break

        # Missing <component> end means no PROCEDURES (really malformed XML).
        # However, if <component></component> is on the same line, then it's
        # an empty component, so keep searching on the next line.
        component_end = find_line(component_start, end_index,
                                  ["</component>"])
        if component_end < 0:
            break
        if component_start == component_end:
            start_index = component_start + 1
            continue

        # Whatever goes wrong from here on, the next attempt resumes after
        # this component.  Advancing unconditionally also covers the case
        # where the templateId is present but the <code> line is not, which
        # previously left start_index unchanged and looped forever.
        start_index = component_end + 1

        # Search for a <section> element bounded by the component.
        section_start = find_line(component_start + 1, component_end,
                                  ["<section>"])
        if section_start < 0:
            continue

        # Missing <section> end means malformed XML inside this <component>;
        # <section></section> on the same line is an empty section.
        section_end = find_line(section_start, component_end, ["</section>"])
        if section_end < 0 or section_start == section_end:
            continue

        template_index = find_line(
            section_start + 1, section_end,
            ["<templateId", 'root="2.16.840.1.113883.10.20.1.12"'])
        if template_index < 0:
            continue

        code_index = find_line(
            template_index + 1, section_end,
            ["<code",
             'code="47519-4"',
             'codeSystem="2.16.840.1.113883.6.1"'])
        if code_index >= 0:
            return (component_start, component_end)

    return (-1, -1)
197
198################################################################################
199# Entry point: check command arguments and do the work
200################################################################################
def main(argv):
    """Run ProcFix for the given argument vector and return the exit status.

    argv[1] is the code list CSV file and argv[2] is the quasi-XML file,
    which is read completely, filtered, and then overwritten in place.
    Returns 1 after printing a usage message when the argument count is
    wrong, otherwise 0.
    """
    # Check for proper usage
    if len(argv) != 3:
        print('Usage: ' + argv[0] + \
              ' codelistcsvfile xmlfile')
        return 1

    # Create the medical code dictionary using supplied Appendix G data
    code_dictionary = create_code_dictionary(argv[1])

    # Now read in the quasi-XML source file into a list.
    with open(argv[2], 'r') as infile:
        source_lines = infile.readlines()
    print("Processing ", argv[2])

    valid_entry_count, section_deleted = _filter_procedures(source_lines,
                                                            code_dictionary)

    # Output the modified source line array; deleted lines were replaced by
    # empty strings and are dropped here.
    with open(argv[2], 'wt') as outfile:
        outfile.writelines(line for line in source_lines if len(line) > 0)
    print(" Processing complete, valid entry count: ", valid_entry_count)
    # Only claim a deletion when the section was really found and removed.
    if section_deleted:
        print(" Entire procedure section has been deleted")
    return 0


def _filter_procedures(source_lines, code_dictionary):
    """Delete invalid PROCEDURES entries from source_lines, in place.

    Deleted lines are replaced by empty strings so that line indexes stay
    stable.  Returns (valid_entry_count, section_deleted), where
    section_deleted is true only when the section end was reached with no
    valid entry and the whole section was therefore blanked.

    According to the CMS 2011 specs, the structure for PROCEDURES is:

      <component>
      <section>
      <templateId root="2.16.840.1.113883.10.20.1.12"/>
      <code code="47519-4" codeSystem="2.16.840.1.113883.6.1"/>
      ...various PROCEDURES elements
      </section>
      </component>

    Note that there can be multiple templateId elements.  However, the one
    given above (2.16.840.1.113883.10.20.1.12) is mandatory and serves (in
    conjunction with the code element) to identify the PROCEDURES section.

    The scan is line oriented: the header elements are expected on
    consecutive lines, and the first </component> line after the header
    ends the section (nested <component> elements are not supported).
    """
    open_text_tag = "<text>"
    close_text_tag = "</text>"

    finding_procedures = False   # inside a candidate section header
    in_procedures = False        # header fully recognised
    kill_procedures_text = False # still looking for the header <text>
    finding_entry = False        # inside an <entry>, <code> not yet seen
    found_entry = False          # <code> seen, waiting for </entry>

    procedures_start_index = -1
    text_start_index = -1
    entry_start_index = 0
    code_key = None
    valid_entry_count = 0
    section_deleted = False

    # Blanking only touches indexes at or before the current one, so it is
    # safe to iterate over the list that is being modified.
    for line_index, line in enumerate(source_lines):

        # Look for the start of the PROCEDURES section.  Any line that does
        # not continue the expected header sequence resets the search.
        if not in_procedures:
            if match(line, "<component>"):
                finding_procedures = True
                procedures_start_index = line_index
            elif finding_procedures and (
                    match(line, "<section>") or
                    match(line, "<templateId",
                          'root="2.16.840.1.113883.10.20.1.12"')):
                pass
            elif finding_procedures and match(
                    line,
                    "<code",
                    'code="47519-4"',
                    'codeSystem="2.16.840.1.113883.6.1"'):
                finding_procedures = False
                in_procedures = True
                kill_procedures_text = True
            else:
                finding_procedures = False

            if not in_procedures:
                continue

        # Look for end of the PROCEDURES section, which stops all special
        # processing.  A section without any valid entry is removed whole.
        if match(line, "</component>"):
            if valid_entry_count == 0:
                for n in range(procedures_start_index, line_index + 1):
                    source_lines[n] = ""
                section_deleted = True
            break

        # Replace a real <text> element with one that is empty.  Only do
        # this once and in the PROCEDURES header.
        if kill_procedures_text and match(line, open_text_tag):
            if match(line, close_text_tag):
                # Whole element on one line: cut out what lies between the
                # tags (when they can be located literally) and stop
                # looking, so that a later </text> inside an entry cannot
                # trigger the blanking below.
                tag_start = line.find(open_text_tag)
                close_start = line.find(close_text_tag)
                if 0 <= tag_start < close_start:
                    source_lines[line_index] = (
                        line[:tag_start + len(open_text_tag)] +
                        line[close_start:])
                kill_procedures_text = False
            else:
                text_start_index = line_index + 1
            continue

        if kill_procedures_text and match(line, close_text_tag):
            # text_start_index stays negative when the opening tag was not
            # recognised (for example because it carries attributes); in
            # that case there is nothing that can safely be blanked.
            if text_start_index >= 0:
                for n in range(text_start_index, line_index):
                    source_lines[n] = ""
            kill_procedures_text = False
            continue

        # Look for an entry element inside the PROCEDURES section.  Looking
        # for:
        #   <entry typeCode="DRIV">
        #   <procedure classCode="PROC" moodCode="EVN">
        #   <templateId root="2.16.840.1.113883.10.20.1.29" ... />
        #   ... (multiple templateId elements are permitted) ...
        #   <id root="PROCEDURE_4141_10" extension="CCRObjectID"/>
        #   <code displayName="..." code="99242"
        #    codeSystemName="CPT-4" codeSystem="2.16.840.1.113883.6.12"/>
        if match(line, '<entry typeCode="DRIV">'):
            finding_entry = True
            entry_start_index = line_index
            # The section header is over once entries begin; never empty a
            # <text> element that belongs to an entry.
            kill_procedures_text = False
        elif finding_entry and (
                match(line, '<procedure classCode="PROC"',
                      'moodCode="EVN">') or
                match(line,
                      '<templateId root="2.16.840.1.113883.10.20.1.29"')):
            # Expected entry preamble; keep scanning for the <code> line.
            pass
        elif finding_entry and match(line, '<code'):
            code_key = build_code_key(line)
            finding_entry = False
            found_entry = True

        # If at end of the procedure entry, validate against the codes.
        # Keep the entry if its code is valid, otherwise blank its lines.
        if found_entry and match(line, "</entry>"):
            if code_key in code_dictionary:
                valid_entry_count = valid_entry_count + 1
            else:
                for n in range(entry_start_index, line_index + 1):
                    source_lines[n] = ""
                print(" Deleting procedure entry with invalid code ",
                      code_key)
            finding_entry = False
            found_entry = False

    return (valid_entry_count, section_deleted)


if __name__ == "__main__":
    sys.exit(main(sys.argv))
339
340
341
342
343
344
345
346
347
348
349
350
351
Note: See TracBrowser for help on using the repository browser.