Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

ProcFix.py@ 1482

Last change on this file since 1482 was 1004, checked in by George Lilly, 14 years ago
final PQRI files
File size: 13.0 KB

Line
1	################################################################################
2	#
3	# Process the PROCEDURES section of an intermediate CCR Level 3 XML file using
4	# a set of rules.
5	#
6	# Usage: ProcFix codelistcsvfile xmlfile
7	#
8	# Where:
9	# codelistcsvfile: defines valid medical vocabulary codes for PROCEDURES
10	# xmlfile: the XML file to process
11	# (xmlfile is read, then rewritten in place; there is no separate output file)
12	#
13	# The process of producing quality measures XML for submission to CMS (as part
14	# of Meaningful Use certification) is quite complex. Briefly, the steps are:
15	#
16	# 1. The VistA EHR outputs a CCR record (XML).
17	#
18	# 2. The CCR is transformed to CCD Level 3 format (XML) using XSLT.
19	#
20	# 3. Due to data vagaries (desire to limit patient-identifying information and
21	# new Schematron validation directives introduced for 2011), additional
22	# processing is necessary to output a final XML file for submission to CMS.
23	#
24	# A series of sed, awk, and other (such as this program) commands are
25	# applied to slowly move the baseline CCD into a final format that will
26	# successfully validate with CMS. Some of the intermediate "XML" files
27	# produced during this multi-step transformation may not be well-formed,
28	# which makes it difficult to use tools such as XSLT.
29	#
30	# This program only assumes that the XML for the PROCEDURES section will be
31	# well-formed. Other sections of the input file may not be valid XML, but that
32	# will not adversely affect program operation.
33	#
34	################################################################################
35
36	import sys
37	import csv
38
39	################################################################################
40	# This function extracts the code value and OID from the supplied line and
41	# builds a code dictionary key of the form "OID:value"
42	################################################################################
def _attribute_value(text, name):
    """Return the double-quoted value of attribute *name* in *text*.

    *text* is expected to have had its whitespace removed already. An empty
    string is returned when the attribute, or its closing quote, is absent.
    """
    marker = name + '="'
    start = text.find(marker)
    if start < 0:
        return ''
    start += len(marker)
    end = text.find('"', start)
    if end < 0:
        return ''
    return text[start:end]


def build_code_key(line):
    """Build a code dictionary key of the form "OID:value" from a <code> line.

    The line is searched for the attributes code="..." and codeSystem="...".
    All whitespace is removed first, so arbitrary spacing around '=' is
    tolerated (and any whitespace inside the values is dropped as well).

    Args:
        line: one line of source text holding a <code .../> element.

    Returns:
        codeSystem + ':' + code. A missing attribute contributes an empty
        string, so a line with neither attribute yields ':' - a key that
        will not be present in the code dictionary.
    """
    # Previously a missing attribute made find() return -1 and the slice
    # picked up unrelated text; _attribute_value() returns '' instead.
    squeezed_line = ''.join(line.split())
    code_value = _attribute_value(squeezed_line, 'code')
    code_system = _attribute_value(squeezed_line, 'codeSystem')

    return code_system + ':' + code_value
56
57	################################################################################
58	# Read the CSV file, which is exported from Appendix G of the 2011 Downloadable
59	# Resources table. This file defines the medical vocabulary codes that may be
60	# used by the PROCEDURES section. File format:
61	#
62	# - Column 1 is the symbolic code set name
63	# - Column 2 is the HL7 OID (object identifier number) for the code set
64	# - Column 3 is the actual code value
65	#
66	# The symbolic code set name is defined by Appendix G using terminology such as
67	# C4 to mean CPT-4, SNM to mean SNOMED-CT, LN to mean LOINC, and so on. The
68	# VistA CCD output uses different symbolic names (for example, CPT-4 instead of
69	# C4). The CMS validation rules do not validate symbolic code set names.
70	#
71	# Since there is no code set naming contract between VistA CCD output and this
72	# program, it would be unwise to use observed values - so we will not check the
73	# symbolic code name attributes.
74	#
75	# Per CMS 2011 rules, we must check the code set OID and the associated code
76	# value. A dictionary is an easy way to manage the OID/value information.
77	# The key will be OID + ':' + CodeValue, and the value will be CodeValue.
78	################################################################################
def create_code_dictionary(csvfilename):
    """Load the valid PROCEDURES codes from a CSV file into a dictionary.

    Each usable row supplies the code set OID in column 2 and the code value
    in column 3; column 1 (the symbolic code set name) is ignored.

    Args:
        csvfilename: path of the CSV file to read.

    Returns:
        A dict mapping "OID:CodeValue" to CodeValue.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    codedict = {}
    # newline='' lets the csv module do its own line-ending handling.
    with open(csvfilename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Blank lines come back as [] and truncated rows have fewer than
            # three columns; skip them rather than fail with IndexError.
            if len(row) < 3:
                continue
            codedict[row[1] + ":" + row[2]] = row[2]

    return codedict
91
92	################################################################################
93	# Do a pattern match on the supplied line using the supplied patterns. Both
94	# the line and patterns are processed (local copies) to remove all blanks. This
95	# is necessary because XML attributes inside an element have arbitrary spacing
96	# rules. For example,
97	#
98	# <element attr = "value"...
99	# <element attr = "value"...
100	# <element attr="value"
101	#
102	# Are all valid XML. Removing all whitespace makes sane pattern matching
103	# possible.
104	################################################################################
def match(line, pat1, pat2="", pat3="", pat4=""):
    """Report whether *line* contains every supplied pattern.

    Both the line and the patterns are compared with all whitespace removed
    (spaces, tabs and line endings), because XML permits arbitrary whitespace
    around attributes and '='. Matching is a plain substring test; the
    patterns may occur in any order within the line.

    Args:
        line: the source line to test.
        pat1: required pattern.
        pat2, pat3, pat4: optional extra patterns; an empty string is ignored.

    Returns:
        1 if every pattern occurs in the line, otherwise 0.
    """
    squeezed_line = "".join(line.split())
    patterns = [pat1] + [pat for pat in (pat2, pat3, pat4) if pat]

    for pattern in patterns:
        if "".join(pattern.split()) not in squeezed_line:
            return 0

    return 1
120
121	################################################################################
122	# Find the start and end of the PROCEDURES section. According to the CMS 2011
123	# specs, the structure for PROCEDURES is:
124	#
125	# <component>
126	# <section>
127	# <templateId root="2.16.840.1.113883.10.20.1.12"/>
128	# <code code="47519-4" codeSystem="2.16.840.1.113883.6.1"/>
129	# ...various PROCEDURES elements
130	# </section>
131	# </component>
132	#
133	# Note that there can be multiple templateId elements. However, the one given
134	# above (2.16.840.1.113883.10.20.1.12) is mandatory and serves (in conjunction
135	# with the code element) to identify the PROCEDURES section.
136	#
137	################################################################################
def _find_line(source, first, last, patterns):
    """Return the index of the first line in source[first..last] (inclusive)
    that contains every pattern, or -1 if there is none.

    Lines and patterns are compared with all whitespace removed.
    """
    wanted = ["".join(pattern.split()) for pattern in patterns]
    last = min(last, len(source) - 1)
    for index in range(max(first, 0), last + 1):
        squeezed = "".join(source[index].split())
        if all(item in squeezed for item in wanted):
            return index
    return -1


def find_procedures_section(source):
    """Locate the PROCEDURES <component> in a list of source lines.

    A component qualifies when, between its <component> and </component>
    lines, there is a <section> ... </section> pair that contains the
    templateId root 2.16.840.1.113883.10.20.1.12 followed by the code
    47519-4 in code system 2.16.840.1.113883.6.1. Each tag is expected on a
    line of its own.

    Args:
        source: list of text lines.

    Returns:
        (component_start, component_end) line indexes of the first
        qualifying component, or (-1, -1) when none is found.

    NOTE(review): a <component> is paired with the first </component> that
    follows it, so nested components are not balanced - confirm this is
    acceptable before relying on the returned range.
    """
    start_index = 0
    end_index = len(source) - 1
    # component, section, templateId, code, /section, /component
    min_procedures_len = 6

    while True:
        # If fewer than the minimum number of lines remain, give up.
        if start_index + min_procedures_len - 1 > end_index:
            break

        # No <component> element means no PROCEDURES. Quit searching.
        component_start = _find_line(source, start_index, end_index,
                                     ["<component>"])
        if component_start < 0:
            break

        # Missing <component> end means no PROCEDURES (really malformed XML).
        # However, if <component></component> is on a single line, then it
        # is an empty component, so keep searching.
        component_end = _find_line(source, component_start, end_index,
                                   ["</component>"])
        if component_end < 0:
            break
        if component_start == component_end:
            start_index = component_start + 1
            continue

        # Every failure below resumes the search after this component.
        start_index = component_end + 1

        # Search for a <section> element bounded by the component.
        section_start = _find_line(source, component_start + 1, component_end,
                                   ["<section>"])
        if section_start < 0:
            continue

        # Missing <section> end means malformed XML inside this <component>;
        # <section></section> on one line is an empty section.
        section_end = _find_line(source, section_start, component_end,
                                 ["</section>"])
        if section_end < 0 or section_start == section_end:
            continue

        template_index = _find_line(
            source, section_start + 1, section_end,
            ["<templateId", 'root="2.16.840.1.113883.10.20.1.12"'])
        if template_index < 0:
            continue

        code_index = _find_line(
            source, template_index + 1, section_end,
            ["<code",
             'code="47519-4"',
             'codeSystem="2.16.840.1.113883.6.1"'])
        if code_index >= 0:
            return (component_start, component_end)
        # Not the PROCEDURES section: start_index has already been advanced,
        # so the loop moves on instead of re-scanning the same component
        # forever.

    return (-1, -1)
197
198	################################################################################
199	# Entry point: check command arguments and do the work
200	################################################################################
def _process_procedures(source_lines, code_dictionary):
    """Validate the PROCEDURES section of source_lines, editing it in place.

    The PROCEDURES section is recognised by these lines appearing one after
    another, each on its own line:

        <component>
        <section>
        <templateId root="2.16.840.1.113883.10.20.1.12"/>
        <code code="47519-4" codeSystem="2.16.840.1.113883.6.1"/>
        ...various PROCEDURES elements
        </section>
        </component>

    More than one templateId line may appear; the one above serves, in
    conjunction with the code element, to identify the section.

    Inside the section:
      - the lines strictly between the header's <text> line and its </text>
        line are blanked (once only);
      - each <entry typeCode="DRIV"> ... </entry> whose <code> is not a key
        of code_dictionary is blanked;
      - at </component>, if no entry was valid, the whole section (from its
        <component> line onward) is blanked.

    Blanked lines are set to "" so that the caller can drop them on output.

    Args:
        source_lines: list of lines; modified in place.
        code_dictionary: mapping keyed by "OID:value", as produced by
            create_code_dictionary().

    Returns:
        (valid_entry_count, section_deleted): the number of entries kept and
        whether the whole PROCEDURES section was blanked.
    """
    finding_procedures = False
    found_procedures = False
    kill_procedures_text = False
    finding_entry = False
    found_entry = False

    procedures_start_index = -1
    text_start_index = None
    entry_start_index = 0
    code_key = None
    valid_entry_count = 0
    section_deleted = False

    # Iterate over a snapshot because source_lines is edited along the way.
    for line_index, line in enumerate(source_lines[:]):
        # Look for the start of the PROCEDURES section. Any line that does
        # not continue the expected header sequence restarts the search.
        if match(line, "<component>"):
            finding_procedures = True
            procedures_start_index = line_index
        elif finding_procedures and match(line, "<section>"):
            pass
        elif finding_procedures and match(
                line,
                "<templateId",
                'root="2.16.840.1.113883.10.20.1.12"'):
            pass
        elif finding_procedures and match(
                line,
                "<code",
                'code="47519-4"',
                'codeSystem="2.16.840.1.113883.6.1"'):
            finding_procedures = False
            found_procedures = True
            kill_procedures_text = True
            finding_entry = False
            found_entry = False
        else:
            finding_procedures = False

        if not found_procedures:
            continue

        # The end of the PROCEDURES section stops all special processing.
        if match(line, "</component>"):
            if valid_entry_count == 0:
                for n in range(procedures_start_index, line_index + 1):
                    source_lines[n] = ""
                section_deleted = True
            break

        # Empty the header's <text> element: remember where its content
        # starts, then blank everything up to the closing tag. A </text> on
        # the same line as <text> is not examined because of the continue.
        if kill_procedures_text and match(line, "<text>"):
            text_start_index = line_index + 1
            continue

        if kill_procedures_text and match(line, "</text>"):
            # text_start_index stays None when no bare <text> line preceded
            # this one; in that case there is nothing to blank.
            if text_start_index is not None:
                for n in range(text_start_index, line_index):
                    source_lines[n] = ""
            kill_procedures_text = False
            continue

        # Look for an entry element inside the PROCEDURES section:
        #   <entry typeCode="DRIV">
        #     <procedure classCode="PROC" moodCode="EVN">
        #       <templateId root="2.16.840.1.113883.10.20.1.29" ... />
        #       ... (multiple templateId elements are permitted) ...
        #       <id root="PROCEDURE_4141_10" extension="CCRObjectID"/>
        #       <code displayName="..." code="99242"
        #             codeSystemName="CPT-4" codeSystem="2.16.840.1.113883.6.12"/>
        # The first <code line seen after the entry start supplies the key.
        if match(line, '<entry typeCode="DRIV">'):
            finding_entry = True
            entry_start_index = line_index
        elif finding_entry and \
                match(line, '<procedure classCode="PROC"', 'moodCode="EVN">'):
            pass
        elif finding_entry and \
                match(line, '<templateId root="2.16.840.1.113883.10.20.1.29"'):
            pass
        elif finding_entry and match(line, '<code'):
            code_key = build_code_key(line)
            finding_entry = False
            found_entry = True

        # At the end of a procedure entry, validate its code. A valid entry
        # is kept and counted; an invalid one is blanked.
        if found_entry and match(line, "</entry>"):
            if code_key in code_dictionary:
                valid_entry_count = valid_entry_count + 1
            else:
                for n in range(entry_start_index, line_index + 1):
                    source_lines[n] = ""
                print(" Deleting procedure entry with invalid code ", code_key)
            finding_entry = False
            found_entry = False

    return valid_entry_count, section_deleted


def main(argv=None):
    """Entry point: check command arguments and do the work.

    Args:
        argv: argument list in sys.argv layout (program name, CSV code list
            file, XML file); defaults to sys.argv.

    Returns:
        The process exit status: 1 on a usage error, otherwise 0.
    """
    if argv is None:
        argv = sys.argv

    # Check for proper usage
    if len(argv) != 3:
        print('Usage: ' + argv[0] +
              ' codelistcsvfile xmlfile')
        return 1

    # Create the medical code dictionary using supplied Appendix G data
    code_dictionary = create_code_dictionary(argv[1])

    # Read the quasi-XML source file into a list of lines.
    with open(argv[2], 'r') as infile:
        source_lines = infile.readlines()
    print("Processing ", argv[2])

    valid_entry_count, section_deleted = _process_procedures(
        source_lines, code_dictionary)

    # Write the modified lines back over the input file, dropping every line
    # that was blanked.
    with open(argv[2], 'wt') as outfile:
        outfile.writelines(line for line in source_lines if len(line) > 0)
    print(" Processing complete, valid entry count: ", valid_entry_count)
    # Only claim the deletion when it actually happened; a zero count alone
    # also occurs when the file has no PROCEDURES section at all.
    if section_deleted:
        print(" Entire procedure section has been deleted")
    return 0


if __name__ == "__main__":
    sys.exit(main())
339
340
341
342
343
344
345
346
347
348
349
350
351

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: ccr2ccd-xslt/trunk/make/ProcFix.py@ 1482

Download in other formats: