Extract Matched Records from XML

Hi All,

I have a requirement to extract para in XML file on the basis of another list file having specific parameters.
I will extract these para from XML and import in one scheduler tool.
file2

<FOLDER DATACENTER="ControlMserver" VERSION="800" PLATFORM="UNIX" FOLDER_NAME="SH_AP_INT_B01" MODIFIED="False" LAST_UPLOAD="20151202132638UTC" REAL_FOLDER_ID="193" TYPE="1" USED_BY_CODE="0">
        <JOB JOBISN="1" APPLICATION="SH_AP_INT_B01" SUB_APPLICATION="SH_AP_INT_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="JB_AP_ASSG_PRCE_PLAN_INT_B01" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Job" CYCLIC="0" NODEID="ser455" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151202" CHANGE_TIME="185636" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="3" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_INT_B01">
            <VARIABLE NAME="%%PARM1" VALUE="wf_AP_INT_ASSG_PRCE_PLAN_INT_INS" />
            <VARIABLE NAME="%%PARM2" VALUE="ETL" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_PRD_PRDCTLGREL_REF_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="JB_AP_ASSG_PRCE_PLAN_INT_B01-OK" ODATE="ODAT" SIGN="+" />
        </JOB>
        <JOB JOBISN="2" APPLICATION="SH_AP_INT_B01" SUB_APPLICATION="SH_AP_INT_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="JB_AP_ASSG_PROD_INT_B01" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Job" CYCLIC="0" NODEID="ser455" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151202" CHANGE_TIME="185636" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="3" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_INT_B01">
            <VARIABLE NAME="%%PARM1" VALUE="merge_ap_assg_prod_int.sql" />
            <VARIABLE NAME="%%PARM2" VALUE="SQL" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_PRODCTLG_SH_PRD_REF_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_AP_TBCMMT_TERM_LKP_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="JB_AP_ASSG_PROD_INT_B01-OK" ODATE="ODAT" SIGN="+" />
        </JOB>
        <JOB JOBISN="3" APPLICATION="SH_AP_INT_B01" SUB_APPLICATION="SH_AP_INT_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="SH_AP_INT_B01_END" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Dummy" CYCLIC="0" NODEID="ser455" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151202" CHANGE_TIME="185636" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="3" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_INT_B01">
            <VARIABLE NAME="%%PARM1" VALUE="merge_ap_assg_prod_int.sql" />
            <VARIABLE NAME="%%PARM2" VALUE="SQL" />
            <VARIABLE NAME="%%$BMCWAIORIGTYPE" VALUE="Job" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_AP_ASSG_PRCE_PLAN_INT_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_AP_ASSG_PROD_INT_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="SH_AP_INT_B01_END-OK" ODATE="ODAT" SIGN="+" />
        </JOB>
    </FOLDER>
	<FOLDER DATACENTER="ControlMserver" VERSION="800" PLATFORM="UNIX" FOLDER_NAME="SH_AP_RI_B01" MODIFIED="False" LAST_UPLOAD="20151203033754UTC" REAL_FOLDER_ID="194" TYPE="1" USED_BY_CODE="0">
        <JOB JOBISN="1" APPLICATION="SH_AP_RI_B01" SUB_APPLICATION="SH_AP_RI_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="JB_AP_CUST_KEY_ASSG_PROD_RI_B01" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Job" CYCLIC="0" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151203" CHANGE_TIME="090753" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="4" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_RI_B01">
            <VARIABLE NAME="%%PARM1" VALUE="ri_ap_cust_key_assg_prod.sql" />
            <VARIABLE NAME="%%PARM2" VALUE="SQL" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_AP_ASSG_PROD_INT_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="JB_AP_CUST_KEY_ASSG_PROD_RI_B01-OK" ODATE="ODAT" SIGN="+" />
        </JOB>
	</FOLDER>

In above xml data, <FOLDER will create new Folder having name in variable like FOLDER_NAME="SH_AP_INT_B01" , within these folder there are many jobs. Each job will start in <JOB .....and end by... </JOB> .. I don't want all Jobs in xml , so I will compare it with list in another file, Only jobs those are present in another file should be in XML.
Name of each job is having variable (eg.) JOBNAME="JB_AP_ASSG_PROD_INT_B01" . List of job name is like file1 .

<JOB JOBNAME="JB_AP_ASSG_PROD_INT_B01"
<JOB JOBNAME="JB_AP_ASSG_PRCE_PLAN_INT_B01"

I tried it like below

awk 'NR==FNR{a[$1];next} $5 in a { print "<JOB"$0}' RS="<JOB" file1 file2

Since i have to take RS="<JOB", so i have added this in front of every Job name in list file
But the problem is , it is not taking Folder name in it.

Final data i want like below.

<FOLDER DATACENTER="ControlMserver" VERSION="800" PLATFORM="UNIX" FOLDER_NAME="SH_AP_INT_B01" MODIFIED="False" LAST_UPLOAD="20151202132638UTC" REAL_FOLDER_ID="193" TYPE="1" USED_BY_CODE="0">
        <JOB JOBISN="1" APPLICATION="SH_AP_INT_B01" SUB_APPLICATION="SH_AP_INT_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="JB_AP_ASSG_PRCE_PLAN_INT_B01" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Job" CYCLIC="0" NODEID="ser455" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151202" CHANGE_TIME="185636" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="3" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_INT_B01">
            <VARIABLE NAME="%%PARM1" VALUE="wf_AP_INT_ASSG_PRCE_PLAN_INT_INS" />
            <VARIABLE NAME="%%PARM2" VALUE="ETL" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_PRD_PRDCTLGREL_REF_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="JB_AP_ASSG_PRCE_PLAN_INT_B01-OK" ODATE="ODAT" SIGN="+" />
        </JOB>
        <JOB JOBISN="2" APPLICATION="SH_AP_INT_B01" SUB_APPLICATION="SH_AP_INT_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="JB_AP_ASSG_PROD_INT_B01" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Job" CYCLIC="0" NODEID="ser455" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151202" CHANGE_TIME="185636" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="3" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_INT_B01">
            <VARIABLE NAME="%%PARM1" VALUE="merge_ap_assg_prod_int.sql" />
            <VARIABLE NAME="%%PARM2" VALUE="SQL" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_PRODCTLG_SH_PRD_REF_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_AP_TBCMMT_TERM_LKP_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="JB_AP_ASSG_PROD_INT_B01-OK" ODATE="ODAT" SIGN="+" />
        </JOB>
</FOLDER>

but i am getting like below. Remember i want folder starting <FOLDER and closing with </FOLDER> . There are many folders, and every folder having many jobs.

<JOB JOBISN="1" APPLICATION="SH_AP_INT_B01" SUB_APPLICATION="SH_AP_INT_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="JB_AP_ASSG_PRCE_PLAN_INT_B01" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Job" CYCLIC="0" NODEID="ser455" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151202" CHANGE_TIME="185636" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="3" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_INT_B01">
            <VARIABLE NAME="%%PARM1" VALUE="wf_AP_INT_ASSG_PRCE_PLAN_INT_INS" />
            <VARIABLE NAME="%%PARM2" VALUE="ETL" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_PRD_PRDCTLGREL_REF_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="JB_AP_ASSG_PRCE_PLAN_INT_B01-OK" ODATE="ODAT" SIGN="+" />
        </JOB>

<JOB JOBISN="2" APPLICATION="SH_AP_INT_B01" SUB_APPLICATION="SH_AP_INT_B01" MEMNAME="ods_script_etl_wrapper.ksh" JOBNAME="JB_AP_ASSG_PROD_INT_B01" CREATED_BY="xyz" RUN_AS="abc" CRITICAL="0" TASKTYPE="Job" CYCLIC="0" NODEID="ser455" INTERVAL="00001M" MEMLIB="/c/bin/" CONFIRM="0" RETRO="0" MAXWAIT="0" MAXRERUN="0" AUTOARCH="1" MAXDAYS="0" MAXRUNS="0" DAYS="ALL" JAN="1" FEB="1" MAR="1" APR="1" MAY="1" JUN="1" JUL="1" AUG="1" SEP="1" OCT="1" NOV="1" DEC="1" DAYS_AND_OR="O" SHIFT="Ignore Job" SHIFTNUM="+00" SYSDB="1" IND_CYCLIC="S" CREATION_USER="emuser" CREATION_DATE="20151120" CREATION_TIME="160829" CHANGE_USERID="user" CHANGE_DATE="20151202" CHANGE_TIME="185636" RULE_BASED_CALENDAR_RELATIONSHIP="O" APPL_TYPE="OS" MULTY_AGENT="N" USE_INSTREAM_JCL="N" VERSION_OPCODE="N" IS_CURRENT_VERSION="Y" VERSION_SERIAL="3" VERSION_HOST="PIGAUTAM02" CYCLIC_TOLERANCE="0" CYCLIC_TYPE="C" PARENT_FOLDER="SH_AP_INT_B01">
            <VARIABLE NAME="%%PARM1" VALUE="merge_ap_assg_prod_int.sql" />
            <VARIABLE NAME="%%PARM2" VALUE="SQL" />
            <INCOND NAME="SH_CYCLE_AUDIT_START_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_PRODCTLG_SH_PRD_REF_B01-OK" ODATE="ODAT" AND_OR="A" />
            <INCOND NAME="JB_AP_TBCMMT_TERM_LKP_B01-OK" ODATE="ODAT" AND_OR="A" />
            <OUTCOND NAME="JB_AP_ASSG_PROD_INT_B01-OK" ODATE="ODAT" SIGN="+" />
        </JOB>

Kindly help me on this.

The awk that I have (on OS X) only processes the first character in RS , but the following seems to do what you want (and should work with any version of awk , except use /usr/xpg4/bin/awk or nawk if you are using a Solaris/SunOS system):

awk '
FNR == NR {
	jn[$2]
	next
}
/<FOLDER / {
	nf = 0
	folder = $0
	next
}
/<JOB / && $6 in jn {
	copy = 1
	if(!nf++)
		print folder
}
copy
/<\/JOB>/ {
	copy = 0
	next
}
/<\/FOLDER/ && nf {
	print
}' file1 file2
1 Like

Hello Mr Don Thanks for this useful code. But could you please help me with some doubts.

(1)/<FOLDER / {
	nf = 0
	folder = $0
	next
}
(2)/<JOB / && $6 in jn {
	copy = 1
	if(!nf++)
		print folder
}
copy
(3)/<\/JOB>/ {
	copy = 0
	next
}
(4)/<\/FOLDER/ && nf {
	print
}' file1 file2
  1. Is "copy" variable, Part of block (2) ?, which will have a value 1 .That will print as default.
  2. What is the purpose of block (3) . As no print statement or default 1 is present.
    3.Which part of code is printing other lines like <VARIABLE....<INCOND......
    Thanks,

In awk , the blocks you are talking about have the format:

    pattern { action }

and as you know if the { action } part is missing, it defaults to:

    pattern { print $0 }

So, the line:

copy

is a block of its own that could be rewritten as:

copy {	print
}

(since print with no arguments defaults to print $0 ). When the variable copy is set to a non-zero, non-empty value, that line causes the current line to be printed. And when copy is an empty string (the default value until it is set to something else) or zero that block is a no-op.

Block (3) turns off copying input lines to the output after the end of a set of <JOB> tags:

<JOB...>
...
</JOB>

has been conditionally printed.

The line in the code:

copy

prints everything this script prints except the opening <FOLDER...> tag line that is printed in block (2) the 1st time a job is found that is to be printed in that folder and the </FOLDER> line that is printed in block (4) iff one or more jobs were found to be printed in the preceding folder.

2 Likes