awk output different between two files

cmccabe · September 21, 2016, 11:41am

The awk script below, when run on the contents of `file`, works great and produces the desired output in the form:

expName
barcodeSampleInfo barcodedSamples
However, when the complete file (attached) is used, I get different output. The same data appears to be there, but the ordering is off. Both data sets are html, and I am not sure what causes the difference. Thank you :).

file

{"barcodeId": "IonXpress", "barcodedSamples": {"MEV34": {"barcodeSampleInfo": {"IonXpress_004": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_004"]}, "MEV35": {"barcodeSampleInfo": {"IonXpress_005": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_005"]}, "MEV36": {"barcodeSampleInfo": {"IonXpress_006": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_006"]}}, "chipDescription": "540", "chipInstrumentType": "S5", "chipType": "540", "date": "2016-09-20T15:14:38+00:00", "expName": "R_2016_09_20_10_12_41_user_S5-00580-6-Medexome", "flows": 500
{"barcodeId": "IonXpress", "barcodedSamples": {"MEV45": {"barcodeSampleInfo": {"IonXpress_007": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_007"]}, "MEV46": {"barcodeSampleInfo": {"IonXpress_008": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_008"]}, "MEV47": {"barcodeSampleInfo": {"IonXpress_009": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_009"]}}, "chipDescription": "540", "chipInstrumentType": "S5", "chipType": "540", "date": "2016-09-01T18:22:00+00:00", "expName": "R_2016_09_01_13_20_02_user_S5-00580-5-Medexome", "flows": 500,
{"meta": {"limit": 20, "next": null, "offset": 0, "previous": null, "total_count": 8}, "objects": [{"barcodeId": "IonXpress", "barcodedSamples": {"MEV37": {"barcodeSampleInfo": {"IonXpress_007": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_007"]}, "MEV38": {"barcodeSampleInfo": {"IonXpress_008": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_008"]}, "MEV39": {"barcodeSampleInfo": {"IonXpress_009": {"controlSequenceType": "", "description": "", "externalId": "", "hotSpotRegionBedFile": "", "nucleotideType": "DNA", "reference": "hg19", "targetRegionBedFile": "/results/uploads/BED/6/hg19/unmerged/detail/LCHv2_IDP.bed"}}, "barcodes": ["IonXpress_009"]}}, "chipDescription": "540", "chipInstrumentType": "S5", "chipType": "540", "date": "2016-09-20T17:49:30+00:00", "expName": "R_2016_09_20_12_47_36_user_S5-00580-7-Medexome", "flows": 500

output from file (desired)

R_2016_09_20_10_12_41_user_S5-00580-6-Medexome
IonXpress_004 MEV34
IonXpress_005 MEV35
IonXpress_006 MEV36
R_2016_09_01_13_20_02_user_S5-00580-5-Medexome
IonXpress_007 MEV45
IonXpress_008 MEV46
IonXpress_009 MEV47
R_2016_09_20_12_47_36_user_S5-00580-7-Medexome
IonXpress_007 MEV37
IonXpress_008 MEV38
IonXpress_009 MEV39

awk

# Fields are separated by any run of the characters ] " : { } , and space,
# so every JSON key and value becomes its own awk field.
awk -F"[]\":{}, ]*" '
# Build the lookup table SRCH, keyed by each word of the search list
# (here the single word expName).
BEGIN   {for (n=split ("expName", T); n>0; n--) SRCH[T[n]] = n
        }
# Rule 1: for every field that is a key in SRCH, print the field after it.
# The loop stops before the last field, so $(i+1) never exceeds NF.
        {for (i=1; i<NF; i++) if ($i in SRCH) print $(i+1)
        }
# Rule 2: for every barcodeSampleInfo field, print the field after it and
# the field before it.
# NOTE: awk runs the rules in order for each record, so all of the output
# of rule 1 for a record is printed before any output of rule 2.
        {for (i=1; i<NF; i++) if ($i =="barcodeSampleInfo") print $(i+1)" " $(i-1)
        }
' index.html > out

output using the complete file (attached)

R_2016_09_20_12_47_36_user_S5-00580-7-Medexome
R_2016_09_20_10_12_41_user_S5-00580-6-Medexome
R_2016_09_01_13_20_02_user_S5-00580-5-Medexome
R_2016_09_01_10_24_52_user_S5-00580-4-Medexome
R_2016_08_03_10_42_57_user_S5-00580-2-Medical_Exome
R_2016_08_03_14_04_54_user_S5-00580-3-Medical_Exome
R_2016_07_23_08_40_18_user_S5-00580-1-IQOQ_RUN_Sample_2
R_2016_07_22_17_09_29_user_S5-00580-0-Test_Fragment_Run
IonXpress_007 MEV37
IonXpress_008 MEV38
IonXpress_009 MEV39
IonXpress_004 MEV34
IonXpress_005 MEV35
IonXpress_006 MEV36
IonXpress_007 MEV45
IonXpress_008 MEV46
IonXpress_009 MEV47
IonXpress_004 MEV42
IonXpress_005 MEV43
IonXpress_006 MEV44
IonXpress_001 MEC1
IonXpress_002 MEV40
IonXpress_003 MEV41
IonXpress_001 MEC1
IonXpress_002 MEV40
IonXpress_003 MEV41

RavinderSingh13 · September 21, 2016, 12:27pm

Hello cmccabe,

Could you please try the following and let me know if it helps you?

# Reads Input_file as space-separated tokens (RS=" "). remov() strips the JSON punctuation { " : , from a token.
# The token before each "barcodeSampleInfo": key is the sample name (any name, not only MEV*) and the token after it is the barcode.
# Pairs are collected in Q until the "expName": key of the same object is reached; the run name is then printed, followed by its pairs.
# A run name is printed even when the run has no barcoded samples, and every getline is checked so a truncated file cannot reuse a stale token.
awk 'function remov(a){gsub(/[{":,]/,"",a);return a} {if($0 ~ /"expName":/){if((getline)>0){print remov($0);if(Q!="")print Q;Q=""}}else if($0 ~ /"barcodeSampleInfo":/){if((getline)>0)Q=(Q==""?"":Q ORS) remov($0) OFS remov(P)};P=$0}' RS=" "   Input_file

Output will be as follows.

R_2016_09_20_10_12_41_user_S5-00580-6-Medexome
IonXpress_004 MEV34
IonXpress_005 MEV35
IonXpress_006 MEV36
R_2016_09_01_13_20_02_user_S5-00580-5-Medexome
IonXpress_007 MEV45
IonXpress_008 MEV46
IonXpress_009 MEV47
R_2016_09_20_12_47_36_user_S5-00580-7-Medexome
IonXpress_007 MEV37
IonXpress_008 MEV38
IonXpress_009 MEV39

EDIT: Adding a non-one-liner form of the solution as well.

awk '
# Input is read as space-separated tokens (RS=" "), so every JSON key and
# every simple value arrives as its own record.

# Strip the JSON punctuation { " : , from one token and return the rest.
function remov(a) {
    gsub(/[{":,]/, "", a)
    return a
}
{
    if ($0 ~ /"expName":/) {
        # The run name is the token right after the "expName": key.
        # Check getline so a truncated file does not reprint a stale token.
        if ((getline) > 0) {
            # Print the run name even when the run has no barcoded samples,
            # then the barcode/sample pairs collected for this object.
            print remov($0)
            if (Q != "")
                print Q
            Q = ""
        }
    } else if ($0 ~ /"barcodeSampleInfo":/) {
        # P holds the previous token, which is the sample-name key (any
        # name, not only MEV*); the next token is the barcode key.
        # This relies on barcodedSamples preceding expName inside each
        # object, as in the sample data.
        if ((getline) > 0)
            Q = (Q == "" ? "" : Q ORS) remov($0) OFS remov(P)
    }
    # Remember the current token as the previous one for the next record.
    P = $0
}
' RS=" "    Input_file

Thanks,
R. Singh

RudiC · September 21, 2016, 12:34pm

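Your complete file is ONE single line, so awk reads it as a single record. The first for loop therefore runs over the whole record - printing every expName - before the second loop starts and prints all the barcode/sample pairs. If the output produced is not the desired one, you need to reconsider the script.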

Try

awk -F"[]\":{}, ]*" '
# Walk the fields of each record once, so run names and barcode/sample
# pairs are printed in the order in which they occur in the input.
{
    pos = 1
    while (pos < NF) {
        # A run name is the field that follows the expName key.
        if ($pos == "expName")
            print $(pos + 1)
        # The barcode follows the barcodeSampleInfo key; the sample name
        # is the field just before that key.
        if ($pos == "barcodeSampleInfo")
            print $(pos + 1) " " $(pos - 1)
        pos++
    }
}
' /tmp/index.html