################################################################################
#
# Process the PROCEDURES section of an intermediate CCD Level 3 XML file using
# a set of rules.
#
# Usage: ProcFix codelistcsvfile xmlfile
#
# Where:
#   codelistcsvfile: defines valid medical vocabulary codes for PROCEDURES
#   xmlfile: input file; the processed output is written back to this same file
#
# The process of producing quality measures XML for submission to CMS (as part
# of Meaningful Use certification) is quite complex. Briefly, the steps are:
#
# 1. The VistA EHR outputs a CCR record (XML).
#
# 2. The CCR is transformed to CCD Level 3 format (XML) using XSLT.
#
# 3. Due to data vagaries (desire to limit patient-identifying information and
#    new Schematron validation directives introduced for 2011), additional
#    processing is necessary to output a final XML file for submission to CMS.
#
#    A series of sed, awk, and other (such as this program) commands are
#    applied to slowly move the baseline CCD into a final format that will
#    successfully validate with CMS. Some of the intermediate "XML" files
#    produced during this multi-step transformation may not be well-formed,
#    which makes it difficult to use tools such as XSLT.
#
# This program only assumes that the XML for the PROCEDURES section will be
# well-formed. Other sections of the input file may not be valid XML, but that
# will not adversely affect program operation.
#
################################################################################

import sys
import csv

################################################################################
# Helper for build_code_key: return the quoted value of the attribute `name`
# found in `blankless_line` (a line from which all blanks have already been
# removed).  Returns '' when the attribute is not present or when its value
# has no closing quote, so that a missing attribute can never be turned into
# a slice of unrelated text.
################################################################################
def _extract_attribute(blankless_line, name):
    marker = name + '="'
    start = blankless_line.find(marker)
    if start < 0:
        # Attribute absent: find() returned -1, which must not be used as
        # a slice offset.
        return ''
    start += len(marker)
    end = blankless_line.find('"', start)
    if end < 0:
        # Unterminated value: slicing to -1 would silently drop a character.
        return ''
    return blankless_line[start:end]


################################################################################
# This function extracts the code value and OID from the supplied line and
# builds a code dictionary key of the form "OID:value"
#
# All blanks are removed from the line before the attributes are located, so
# spacing around '=' does not matter (and blanks inside a value are dropped
# as well).  If the code or codeSystem attribute is missing or unterminated,
# the corresponding half of the key is the empty string.
################################################################################
def build_code_key(line):
    blankless_line = line.replace(' ', '')

    # Note: the search text 'code="' cannot match inside 'codeSystem="',
    # because there the characters following 'code' are 'System'.
    code_value = _extract_attribute(blankless_line, 'code')
    code_system = _extract_attribute(blankless_line, 'codeSystem')

    return code_system + ':' + code_value

################################################################################
# Read the CSV file, which is exported from Appendix G of the 2011 Downloadable
# Resources table. This file defines the medical vocabulary codes that may be
# used by the PROCEDURES section. File format:
#
#   - Column 1 is the symbolic code set name
#   - Column 2 is the HL7 OID (object identifier number) for the code set
#   - Column 3 is the actual code value
#
# The symbolic code set name is defined by Appendix G using terminology such as
# C4 to mean CPT-4, SNM to mean SNOMED-CT, LN to mean LOINC, and so on. The
# VistA CCD output uses different symbolic names (for example, CPT-4 instead of
# C4). The CMS validation rules do not validate symbolic code set names.
#
# Since there is no code set naming contract between VistA CCD output and this
# program, it would be unwise to use observed values - so we will not check the
# symbolic code name attributes.
#
# Per CMS 2011 rules, we must check the code set OID and the associated code
# value. A dictionary is an easy way to manage the OID/value information.
# The key will be OID + ':' + CodeValue, and the value will be CodeValue.
################################################################################
def create_code_dictionary(csvfilename):
    # Build and return a dictionary keyed by "OID:CodeValue" (column 2 and
    # column 3 of each CSV row); the stored value is the code value itself.
    # A later row with the same key overwrites the earlier one.
    codedict = {}

    # The with-statement closes the file even if reading raises.
    with open(csvfilename, 'r') as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields [] for blank lines, and a damaged row may
            # have fewer than three columns; skip those instead of failing
            # with IndexError.
            if len(row) < 3:
                continue
            codedict[row[1] + ':' + row[2]] = row[2]

    return codedict

################################################################################
# Do a pattern match on the supplied line using the supplied patterns. Both
# the line and patterns are processed (local copies) to remove all blanks. This
# is necessary because XML attributes inside an element have arbitrary spacing
# rules. For example,
#
# NOTE(review): the rest of this comment, the function header and the first
# statements of the pattern-matching routine are missing from this copy of the
# file. The surviving fragment is kept verbatim on the next line; restore the
# complete definition from the original source before running the script.
#
# 0: match_list.append(pat2.replace(' ', '')) if len(pat3) > 0: match_list.append(pat3.replace(' ', '')) if len(pat4) > 0: match_list.append(pat4.replace(' ', '')) for pattern in match_list: if blankless_line.find(pattern) < 0: return 0 return 1
################################################################################
# Find the start and end of the PROCEDURES section. According to the CMS 2011
# specs, the structure for PROCEDURES is:
#
#
#
# # # ...various PROCEDURES elements #
#
# # Note that there can be multiple templateId elements. However, the one given # above (2.16.840.1.113883.10.20.1.12) is mandatory and serves (in conjuction # with the code element) to identify the PROCEDURES section. # ################################################################################ def find_procedures_section(source): start_index = 0 searching = 1 end_index = len(source) - 1 min_procedures_len = 6 while searching: # If within minimum target size at the end, give up if start_index + min_procedures_len >= end_index: break # No element means no PROCEDURES. Quit searching. patterns = [""] component_start = find_line(start_index, end_index, patterns) if component_start < 0: break # Missing end means no PROCEDURES (really malformed XML). # However, if have on same line, then it's # an empty component, so keep searching. patterns = [""] component_end = find_line(component_start, end_index, patterns) if component_end < 0: break if component_start == component_end: start_index = component_start + 1 continue # Search for
element bounded by component. If not found, # continue search after component end. patterns = ["
"] section_start = find_line(component_start+1, component_end, patterns) if section_start < 0: start_index = component_end + 1 continue # Mission
end means malformed XML inside this . # Recover by continuing search after end. Also continue # search if
on same line. patterns = ["
"] section_end = find_line(section_start, component_end, patterns) if section_end < 0 or section_start == section_end: start_index = component_end + 1 continue patterns = [" 0: return (component_start, component_end) return (-1, -1) ################################################################################ # Entry point: check command arguments and do the work ################################################################################ # Check for proper usage if len(sys.argv) != 3: print('Usage: ' + sys.argv[0] + \ ' codelistcsvfile xmlfile') sys.exit(1) # Create the medical code dictionary using supplied Appendix G data code_dictionary = create_code_dictionary(sys.argv[1]) # Now read in the quasi-XML source file into a list. infile = open(sys.argv[2], 'r') source_lines = infile.readlines() infile.close() print("Processing ", sys.argv[2]) # Open the output file and start processing the input. First, we're looking # for the start of the PROCEDURES section. According to the CMS 2011 specs, # the structure for PROCEDURES is: # # #
# # # ...various PROCEDURES elements #
#
# # Note that there can be multiple templateId elements. However, the one given # above (2.16.840.1.113883.10.20.1.12) is mandatory and serves (in conjuction # with the code element) to identify the PROCEDURES section. found_procedures = 0 kill_procedures_text = 0 finding_entry = 0 found_entry = 0 finding_procedures = 0 # Process lines. line_index = -1 procedures_start_index = -1 procedures_end_index = -1 entry_start_index = 0 valid_entry_count = 0 # Iterate over the source file lines for line in source_lines[:]: line_index = line_index + 1 # Look for the start of the PROCEDURES section if match(line, ""): finding_procedures = 1 procedures_start_index = line_index elif finding_procedures and match(line, "
"): finding_procedures = 1 elif finding_procedures and match(line, \ "") and found_procedures: if valid_entry_count == 0: for n in range(procedures_start_index, line_index+1): source_lines[n] = "" break # Replace a real element with one that is empty. Only do this once # and in the PROCEDURES header. if kill_procedures_text and match(line, ""): text_start_index = line_index + 1 continue if kill_procedures_text and match(line, ""): for n in range(text_start_index, line_index): source_lines[n] = "" kill_procedures_text = 0 continue # Look for an entry element inside the PROCEDURES section. Looking for: # # # # ... (multiple templateId elements are permitted) ... # # if match(line, ''): finding_entry = 1 entry_start_index = line_index elif finding_entry and \ match(line, ''): finding_entry = 1 elif finding_entry and \ match(line, '"): if code_key in code_dictionary: valid_entry_count = valid_entry_count + 1 else: for n in range(entry_start_index, line_index+1): source_lines[n] = "" print(" Deleting procedure entry with invalid code ", code_key) finding_entry = 0 found_entry = 0 # Output the modified source line array outfile = open(sys.argv[2], 'wt') for line in source_lines: if len(line) > 0: outfile.write(line) print(" Processing complete, valid entry count: ", valid_entry_count) if valid_entry_count == 0: print(" Entire procedure section has been deleted") outfile.close() sys.exit(0)