################################################################################
# 
# Process the PROCEDURES section of an intermediate CCR Level 3 XML file using
# a set of rules.
#
# Usage: ProcFix codelistcsvfile xmlfile
#
# Where:
#     codelistcsvfile: defines valid medical vocabulary codes for PROCEDURES
#     xmlfile:         file to process; it is read in full and then
#                      overwritten in place with the result
#
# The process of producing quality measures XML for submission to CMS (as part
# of Meaningful Use certification) is quite complex.  Briefly, the steps are:
#
# 1. The VistA EHR outputs a CCR record (XML).
#
# 2. The CCR is transformed to CCD Level 3 format (XML) using XSLT.
#
# 3. Due to data vagaries (desire to limit patient-identifying information and
#    new Schematron validation directives introduced for 2011), additional
#    processing is necessary to output a final XML file for submission to CMS.
#
#    A series of sed, awk, and other (such as this program) commands are
#    applied to slowly move the baseline CCD into a final format that will
#    successfully validate with CMS.  Some of the intermediate "XML" files
#    produced during this multi-step transformation may not be well-formed,
#    which makes it difficult to use tools such as XSLT.
#
# This program only assumes that the XML for the PROCEDURES section will be
# well-formed.  Other sections of the input file may not be valid XML, but that
#  will not adversely affect program operation.
#
################################################################################

import sys
import csv

################################################################################
# This function extracts the code value and OID from the supplied line and
# builds a code dictionary key of the form "OID:value"
################################################################################
def build_code_key(line):
    """Build the code dictionary key of the form "OID:value" for one line.

    All blanks are removed from a local copy of the line before searching,
    so `code = "x"` and `code="x"` are handled alike.  As a side effect,
    blanks inside the attribute values are removed as well.

    line: text of the XML line carrying the code and codeSystem attributes.

    Returns codeSystem + ':' + code.  An attribute that is absent, or whose
    closing quote is absent, contributes an empty string to the key.
    """
    blankless_line = line.replace(' ', '')

    def attribute_value(name):
        # str.find() reports "not found" as -1.  That value must never be
        # used as a slice index, otherwise unrelated text ends up in the key.
        marker = name + '="'
        start = blankless_line.find(marker)
        if start < 0:
            return ''
        start += len(marker)
        end = blankless_line.find('"', start)
        if end < 0:
            return ''
        return blankless_line[start:end]

    return attribute_value('codeSystem') + ':' + attribute_value('code')
        
################################################################################
# Read the CSV file, which is exported from Appendix G of the 2011 Downloadable
# Resources table.  This file defines the medical vocabulary codes that may be
# used by the PROCEDURES section.  File format:
#
#     - Column 1 is the symbolic code set name
#     - Column 2 is the HL7 OID (object identifier number) for the code set
#     - Column 3 is the actual code value
#
# The symbolic code set name is defined by Appendix G using terminology such as
# C4 to mean CPT-4, SNM to mean SNOMED-CT, LN to mean LOINC, and so on.  The
# VistA CCD output uses different symbolic names (for example, CPT-4 instead of
# C4).  The CMS validation rules do not validate symbolic code set names.
#
# Since there is no code set naming contract between VistA CCD output and this
# program, it would be unwise to use observed values - so we will not check the
# symbolic code name attributes.
#
# Per CMS 2011 rules, we must check the code set OID and the associated code
# value.  A dictionary is an easy way to manage the OID/value information.
# The key will be OID + ':' + CodeValue, and the value will be CodeValue.
################################################################################
def create_code_dictionary(csvfilename):
    """Load the valid PROCEDURES codes from a CSV file into a dictionary.

    csvfilename: path of the CSV file.  Column 2 (row[1]) is the code set
                 OID and column 3 (row[2]) is the code value; column 1 is
                 not used.

    Returns a dictionary whose keys are OID + ':' + code value and whose
    values are the code value.

    Rows with fewer than three columns are skipped.  csv.reader yields an
    empty list for a blank line, so a blank line in the file would otherwise
    raise IndexError.
    """
    codedict = {}
    # The with-statement closes the file even if reading fails part way.
    with open(csvfilename, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 3:
                continue
            codedict[row[1] + ":" + row[2]] = row[2]

    return codedict

################################################################################
# Do a pattern match on the supplied line using the supplied patterns.  Both
# the line and patterns are processed (local copies) to remove all blanks.  This
# is necessary because XML attributes inside an element have arbitrary spacing
# rules.  For example,
#
#    <element attr = "value"...
#    <element   attr = "value"...
#    <element attr="value"
#
# are all valid XML.  Removing the blanks (space characters only) makes sane
# pattern matching possible.
################################################################################
def match(line, pat1, pat2="", pat3="", pat4=""):
    """Report whether every supplied pattern occurs in the line.

    Blanks are removed from local copies of the line and of each pattern
    before comparing, so spacing differences do not affect the result.

    line: text to examine.
    pat1: pattern that is always checked.
    pat2, pat3, pat4: optional patterns; an empty string means "not given".

    Returns 1 when all checked patterns are found, otherwise 0.
    """
    haystack = line.replace(' ', '')

    needles = [pat1]
    for optional in (pat2, pat3, pat4):
        if len(optional) > 0:
            needles.append(optional)

    if all(needle.replace(' ', '') in haystack for needle in needles):
        return 1
    return 0

################################################################################
# Find the start and end of the PROCEDURES section.  According to the CMS 2011
# specs, the structure for PROCEDURES is:
#
#     <component>
#         <section>
#             <templateId root="2.16.840.1.113883.10.20.1.12"/>
#             <code code="47519-4" codeSystem="2.16.840.1.113883.6.1"/>
#             ...various PROCEDURES elements
#         </section>
#     </component>
#
# Note that there can be multiple templateId elements.  However, the one given
# above (2.16.840.1.113883.10.20.1.12) is mandatory and serves (in conjunction
# with the code element) to identify the PROCEDURES section.
#
################################################################################
def find_procedures_section(source):
    """Locate the <component> element that holds the PROCEDURES section.

    source: list of source lines.  Only its length is used here, to bound
            the search.

    Returns a tuple (component_start, component_end) of line indexes, or
    (-1, -1) when no PROCEDURES section is found.

    NOTE(review): find_line() is not defined in the visible part of this
    file and is called without `source`.  Presumably it returns the index of
    the first line in the given index range that contains all the patterns,
    and a negative value when there is none - confirm before relying on
    this function.
    """
    start_index = 0
    end_index = len(source) - 1
    min_procedures_len = 6

    while True:
        # If within minimum target size at the end, give up
        if start_index + min_procedures_len >= end_index:
            break

        # No <component> element means no PROCEDURES.  Quit searching.
        patterns = ["<component>"]
        component_start = find_line(start_index, end_index, patterns)
        if component_start < 0:
            break

        # Missing <component> end means no PROCEDURES (really malformed XML).
        # However, if have <component></component> on same line, then it's
        # an empty component, so keep searching.
        patterns = ["</component>"]
        component_end = find_line(component_start, end_index, patterns)
        if component_end < 0:
            break
        if component_start == component_end:
            start_index = component_start + 1
            continue

        # Search for <section> element bounded by component.  If not found,
        # continue search after component end.
        patterns = ["<section>"]
        section_start = find_line(component_start+1, component_end, patterns)
        if section_start < 0:
            start_index = component_end + 1
            continue

        # Missing <section> end means malformed XML inside this <component>.
        # Recover by continuing search after <component> end.  Also continue
        # search if <section></section> on same line.
        patterns = ["</section>"]
        section_end = find_line(section_start, component_end, patterns)
        if section_end < 0 or section_start == section_end:
            start_index = component_end + 1
            continue

        # The mandatory templateId must appear inside the section.
        patterns = ["<templateId", 'root="2.16.840.1.113883.10.20.1.12"']
        template_index = find_line(section_start+1, section_end, patterns)
        if template_index < 0:
            start_index = component_end + 1
            continue

        # The identifying <code> element must follow the templateId.
        patterns = ["<code", \
                    'code="47519-4"', \
                    'codeSystem="2.16.840.1.113883.6.1"']
        code_index = find_line(template_index+1, section_end, patterns)
        if code_index > 0:
            return (component_start, component_end)

        # This component is not the PROCEDURES section: resume after it.
        # Without this step no loop variable changes, so the same component
        # would be examined again on every iteration and the loop would
        # never end.
        start_index = component_end + 1

    return (-1, -1)

################################################################################
# Entry point: check command arguments and do the work
################################################################################
# Check for proper usage
if len(sys.argv) != 3:
    print('Usage: ' + sys.argv[0] + ' codelistcsvfile xmlfile')
    sys.exit(1)

# Create the medical code dictionary using supplied Appendix G data
code_dictionary = create_code_dictionary(sys.argv[1])

# Read the quasi-XML source file into a list of lines.  The with-statement
# closes the file before processing starts; the result is later written back
# to the same path.
with open(sys.argv[2], 'r') as infile:
    source_lines = infile.readlines()
print("Processing ", sys.argv[2])

# Scan the input for the start of the PROCEDURES section.  According to the
# CMS 2011 specs, the structure for PROCEDURES is:
#
#     <component>
#         <section>
#             <templateId root="2.16.840.1.113883.10.20.1.12"/>
#             <code code="47519-4" codeSystem="2.16.840.1.113883.6.1"/>
#             ...various PROCEDURES elements
#         </section>
#     </component>
#
# Note that there can be multiple templateId elements.  However, the one given
# above (2.16.840.1.113883.10.20.1.12) is mandatory and serves (in conjunction
# with the code element) to identify the PROCEDURES section.

# Scanner state flags (0 = off, 1 = on)
found_procedures = 0        # the PROCEDURES header has been recognised
kill_procedures_text = 0    # the header <text> content is still to be emptied
finding_entry = 0           # inside an <entry> whose <code> is not yet seen
found_entry = 0             # the <code> of the current <entry> has been read
finding_procedures = 0      # inside a candidate <component> header sequence

# Line bookkeeping
procedures_start_index = -1  # index of the most recent <component> line
procedures_end_index = -1    # kept from the original; not used by the loop
entry_start_index = 0        # index of the most recent <entry> line
text_start_index = -1        # first line after the header <text>; -1 = none
valid_entry_count = 0        # entries whose code is in the dictionary
procedures_deleted = 0       # set when the whole section has been blanked

# Iterate over a snapshot of the lines; source_lines itself is edited (lines
# are blanked) while the scan runs.
for line_index, line in enumerate(source_lines[:]):

    # Look for the start of the PROCEDURES section.  The header lines must
    # follow each other directly: any other line drops the candidate.
    if match(line, "<component>"):
        finding_procedures = 1
        procedures_start_index = line_index
    elif finding_procedures and match(line, "<section>"):
        finding_procedures = 1
    elif finding_procedures and match(line, \
                                      "<templateId", \
                                      'root="2.16.840.1.113883.10.20.1.12"'):
        finding_procedures = 1
    elif finding_procedures and match(line, \
                                      "<code", \
                                      'code="47519-4"', \
                                      'codeSystem="2.16.840.1.113883.6.1"'):
        finding_procedures = 0
        found_procedures = 1
        kill_procedures_text = 1
        finding_entry = 0
        found_entry = 0
    else:
        finding_procedures = 0

    if not found_procedures:
        continue

    # Look for end of the PROCEDURES section, which stops all special
    # processing.  If no entry survived validation, blank the whole
    # component, from its <component> line through this line.
    if match(line, "</component>"):
        if valid_entry_count == 0:
            for n in range(procedures_start_index, line_index+1):
                source_lines[n] = ""
            procedures_deleted = 1
        break

    # Replace a real <text> element with one that is empty.  Only do this once
    # and in the PROCEDURES header.
    # NOTE(review): a <text>...</text> pair on a single line is not emptied,
    # because the first test below continues to the next line; the flag then
    # stays set until a later </text> line is seen.
    if kill_procedures_text and match(line, "<text>"):
        text_start_index = line_index + 1
        continue

    if kill_procedures_text and match(line, "</text>"):
        # Blank only the lines between the two tags.  A </text> without a
        # preceding <text> has nothing to blank.
        if text_start_index >= 0:
            for n in range(text_start_index, line_index):
                source_lines[n] = ""
        kill_procedures_text = 0
        continue

    # Look for an entry element inside the PROCEDURES section.  Looking for:
    #     <entry typeCode="DRIV">
    #         <procedure classCode="PROC" moodCode="EVN">
    #         <templateId root="2.16.840.1.113883.10.20.1.29" ... />
    #         ... (multiple templateId elements are permitted) ...
    #         <id root="PROCEDURE_4141_10" extension="CCRObjectID"/>
    #         <code displayName="..." code="99242"
    #               codeSystemName="CPT-4" codeSystem="2.16.840.1.113883.6.12"/>

    if match(line, '<entry typeCode="DRIV">'):
        finding_entry = 1
        entry_start_index = line_index
    elif finding_entry and \
           match(line, '<procedure classCode="PROC"', 'moodCode="EVN">'):
        finding_entry = 1
    elif finding_entry and \
       match(line, '<templateId root="2.16.840.1.113883.10.20.1.29"'):
        finding_entry = 1
    elif finding_entry and match(line, '<code'):
        # The first <code line after <entry> supplies the key to validate.
        code_key = build_code_key(line)
        finding_entry = 0
        found_entry = 1

    # If at end of the procedure entry, validate against the codes.  A valid
    # entry is counted and left alone; an invalid one has all of its lines
    # blanked so that they are skipped on output.
    if found_entry and match(line, "</entry>"):
        if code_key in code_dictionary:
            valid_entry_count = valid_entry_count + 1
        else:
            for n in range(entry_start_index, line_index+1):
                source_lines[n] = ""
            print("    Deleting procedure entry with invalid code ", code_key)
        finding_entry = 0
        found_entry = 0

# Output the modified source line array, skipping the blanked lines.  The
# with-statement closes the file even if a write fails.
with open(sys.argv[2], 'wt') as outfile:
    for line in source_lines:
        if len(line) > 0:
            outfile.write(line)
print("    Processing complete, valid entry count: ", valid_entry_count)
# Report the deletion only when the section really was blanked; a zero count
# alone also occurs when no PROCEDURES section was found.
if procedures_deleted:
    print("    Entire procedure section has been deleted")
sys.exit(0)