Hi, I have a file like this:
<Iteration>
<Iteration_iter-num>3</Iteration_iter-num>
<Iteration_query-ID>lcl|3_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01EQA0A length=197</Iteration_query-def>
<Iteration_query-len>197</Iteration_query-len>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
<Iteration_message>No hits found</Iteration_message>
</Iteration>
<Iteration>
<Iteration_iter-num>4</Iteration_iter-num>
<Iteration_query-ID>lcl|4_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01AUSDH length=64</Iteration_query-def>
<Iteration_query-len>64</Iteration_query-len>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
<Iteration_message>No hits found</Iteration_message>
</Iteration>
<Iteration>
<Iteration_iter-num>5</Iteration_iter-num>
<Iteration_query-ID>lcl|5_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01DPLAS length=224</Iteration_query-def>
<Iteration_query-len>224</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|460414860|ref|XP_004252780.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A-like [Solanum lycopersicum]</Hit_def>
<Hit_accession>XP_004252780</Hit_accession>
<Hit_len>888</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>60.077</Hsp_bit-score>
<Hsp_score>144</Hsp_score>
<Hsp_evalue>1.95683e-09</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>84</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>36</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>56</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTAN--RSPNAKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWAKTGKLGRSHTAKPRVIAISTKAKGQRT-KAFLHVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW T RS AKPR +AISTKAK K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|225458426|ref|XP_002283704.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A isoform 1 [Vitis vinifera] >gi|302142418|emb|CBI19621.3| unnamed protein product [Vitis vinifera]</Hit_def>
<Hit_accession>XP_002283704</Hit_accession>
<Hit_len>886</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>56.6102</Hsp_bit-score>
<Hsp_score>135</Hsp_score>
<Hsp_evalue>3.26752e-08</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>34</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRNMAKPRVLALSTKAKAQRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + N AKPR +A+STKAKA K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>gi|359492097|ref|XP_003634363.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A isoform 2 [Vitis vinifera]</Hit_def>
<Hit_accession>XP_003634363</Hit_accession>
<Hit_len>887</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>56.6102</Hsp_bit-score>
<Hsp_score>135</Hsp_score>
<Hsp_evalue>3.26763e-08</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>34</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRNMAKPRVLALSTKAKAQRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + N AKPR +A+STKAKA K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>4</Hit_num>
<Hit_id>gi|255538520|ref|XP_002510325.1|</Hit_id>
<Hit_def>exocyst complex component sec3, putative [Ricinus communis] >gi|223551026|gb|EEF52512.1| exocyst complex component sec3, putative [Ricinus communis]</Hit_def>
<Hit_accession>XP_002510325</Hit_accession>
<Hit_len>889</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.9138</Hsp_bit-score>
<Hsp_score>128</Hsp_score>
<Hsp_evalue>2.91784e-07</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>32</Hsp_identity>
<Hsp_positive>36</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRQMAKPRVLALSTKSKGTRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + AKPR +A+STK+K T K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>5</Hit_num>
<Hit_id>gi|449460129|ref|XP_004147798.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A-like [Cucumis sativus]</Hit_def>
<Hit_accession>XP_004147798</Hit_accession>
<Hit_len>883</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>52.7582</Hsp_bit-score>
<Hsp_score>125</Hsp_score>
<Hsp_evalue>7.46528e-07</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>84</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>32</Hsp_identity>
<Hsp_positive>35</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>56</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTA--NRSPNAKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGMLGRQQMAKPRVLALSTKEKGPRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + R AKPR +A+STK K K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
Every query record starts with <Iteration> and ends with </Iteration>.
I want to extract only the records for certain queries, for example those listed in a file B:
G383C4U01AUSDH
G383C4U01DPLAS
..
How could I do this?
Thanks.
Quick and simple way:
awk -v RS='<' -F '>' '
    # Each record is the text after one "<": field 1 is the tag name and
    # field 2 is the text that follows the tag.
    $1 == "Iteration_query-def" { print $2 }
' iteration.xml
Actually, both files are very large, and the queries in file B are not consecutive.
In what way does my solution not work for you?
In what way does your data differ from what you posted?
If you mean that you only want to extract information from between <Iteration> tags:
awk '
    # Records are split at "<", so field 1 is the tag name and field 2 the
    # text after it. Compare the tag name exactly: a prefix match such as
    # /^Iteration/ also fires on Iteration_query-def itself, which would turn
    # the flag on for the very record being tested and make the guard useless.
    $1 == "Iteration"                 { inside = 1 }   # opening <Iteration>
    inside && $1 == "Iteration_query-def" { print $2 }
    $1 == "/Iteration"                { inside = 0 }   # closing </Iteration>
' RS="<" FS=">" iteration.xml
Sorry, I think I didn't explain it clearly.
I have an XML file A like this:
<Iteration>
<Iteration_iter-num>3</Iteration_iter-num>
<Iteration_query-ID>lcl|3_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01EQA0A length=197</Iteration_query-def>
<Iteration_query-len>197</Iteration_query-len>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
<Iteration_message>No hits found</Iteration_message>
</Iteration>
<Iteration>
<Iteration_iter-num>4</Iteration_iter-num>
<Iteration_query-ID>lcl|4_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01AUSDH length=64</Iteration_query-def>
<Iteration_query-len>64</Iteration_query-len>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
<Iteration_message>No hits found</Iteration_message>
</Iteration>
<Iteration>
<Iteration_iter-num>5</Iteration_iter-num>
<Iteration_query-ID>lcl|5_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01DPLAS length=224</Iteration_query-def>
<Iteration_query-len>224</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|460414860|ref|XP_004252780.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A-like [Solanum lycopersicum]</Hit_def>
<Hit_accession>XP_004252780</Hit_accession>
<Hit_len>888</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>60.077</Hsp_bit-score>
<Hsp_score>144</Hsp_score>
<Hsp_evalue>1.95683e-09</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>84</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>36</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>56</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTAN--RSPNAKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWAKTGKLGRSHTAKPRVIAISTKAKGQRT-KAFLHVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW T RS AKPR +AISTKAK K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|225458426|ref|XP_002283704.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A isoform 1 [Vitis vinifera] >gi|302142418|emb|CBI19621.3| unnamed protein product [Vitis vinifera]</Hit_def>
<Hit_accession>XP_002283704</Hit_accession>
<Hit_len>886</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>56.6102</Hsp_bit-score>
<Hsp_score>135</Hsp_score>
<Hsp_evalue>3.26752e-08</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>34</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRNMAKPRVLALSTKAKAQRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + N AKPR +A+STKAKA K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>gi|359492097|ref|XP_003634363.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A isoform 2 [Vitis vinifera]</Hit_def>
<Hit_accession>XP_003634363</Hit_accession>
<Hit_len>887</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>56.6102</Hsp_bit-score>
<Hsp_score>135</Hsp_score>
<Hsp_evalue>3.26763e-08</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>34</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRNMAKPRVLALSTKAKAQRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + N AKPR +A+STKAKA K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>4</Hit_num>
<Hit_id>gi|255538520|ref|XP_002510325.1|</Hit_id>
<Hit_def>exocyst complex component sec3, putative [Ricinus communis] >gi|223551026|gb|EEF52512.1| exocyst complex component sec3, putative [Ricinus communis]</Hit_def>
<Hit_accession>XP_002510325</Hit_accession>
<Hit_len>889</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.9138</Hsp_bit-score>
<Hsp_score>128</Hsp_score>
<Hsp_evalue>2.91784e-07</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>32</Hsp_identity>
<Hsp_positive>36</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRQMAKPRVLALSTKSKGTRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + AKPR +A+STK+K T K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>5</Hit_num>
<Hit_id>gi|449460129|ref|XP_004147798.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A-like [Cucumis sativus]</Hit_def>
<Hit_accession>XP_004147798</Hit_accession>
<Hit_len>883</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>52.7582</Hsp_bit-score>
<Hsp_score>125</Hsp_score>
<Hsp_evalue>7.46528e-07</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>84</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>32</Hsp_identity>
<Hsp_positive>35</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>56</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTA--NRSPNAKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGMLGRQQMAKPRVLALSTKEKGPRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + R AKPR +A+STK K K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
....
and a file B that contains the names of interest:
G383C4U01AUSDH
G383C4U01DPLAS
..
I want to get a file C like this:
<Iteration>
<Iteration_iter-num>4</Iteration_iter-num>
<Iteration_query-ID>lcl|4_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01AUSDH length=64</Iteration_query-def>
<Iteration_query-len>64</Iteration_query-len>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
<Iteration_message>No hits found</Iteration_message>
</Iteration>
<Iteration>
<Iteration_iter-num>5</Iteration_iter-num>
<Iteration_query-ID>lcl|5_0</Iteration_query-ID>
<Iteration_query-def>G383C4U01DPLAS length=224</Iteration_query-def>
<Iteration_query-len>224</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|460414860|ref|XP_004252780.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A-like [Solanum lycopersicum]</Hit_def>
<Hit_accession>XP_004252780</Hit_accession>
<Hit_len>888</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>60.077</Hsp_bit-score>
<Hsp_score>144</Hsp_score>
<Hsp_evalue>1.95683e-09</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>84</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>36</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>56</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTAN--RSPNAKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWAKTGKLGRSHTAKPRVIAISTKAKGQRT-KAFLHVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW T RS AKPR +AISTKAK K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|225458426|ref|XP_002283704.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A isoform 1 [Vitis vinifera] >gi|302142418|emb|CBI19621.3| unnamed protein product [Vitis vinifera]</Hit_def>
<Hit_accession>XP_002283704</Hit_accession>
<Hit_len>886</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>56.6102</Hsp_bit-score>
<Hsp_score>135</Hsp_score>
<Hsp_evalue>3.26752e-08</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>34</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRNMAKPRVLALSTKAKAQRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + N AKPR +A+STKAKA K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>gi|359492097|ref|XP_003634363.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A isoform 2 [Vitis vinifera]</Hit_def>
<Hit_accession>XP_003634363</Hit_accession>
<Hit_len>887</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>56.6102</Hsp_bit-score>
<Hsp_score>135</Hsp_score>
<Hsp_evalue>3.26763e-08</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>34</Hsp_identity>
<Hsp_positive>37</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRNMAKPRVLALSTKAKAQRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + N AKPR +A+STKAKA K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>4</Hit_num>
<Hit_id>gi|255538520|ref|XP_002510325.1|</Hit_id>
<Hit_def>exocyst complex component sec3, putative [Ricinus communis] >gi|223551026|gb|EEF52512.1| exocyst complex component sec3, putative [Ricinus communis]</Hit_def>
<Hit_accession>XP_002510325</Hit_accession>
<Hit_len>889</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.9138</Hsp_bit-score>
<Hsp_score>128</Hsp_score>
<Hsp_evalue>2.91784e-07</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>83</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>32</Hsp_identity>
<Hsp_positive>36</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>55</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTANRSPN-AKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGKLGRQMAKPRVLALSTKSKGTRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + AKPR +A+STK+K T K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>5</Hit_num>
<Hit_id>gi|449460129|ref|XP_004147798.1|</Hit_id>
<Hit_def>PREDICTED: exocyst complex component SEC3A-like [Cucumis sativus]</Hit_def>
<Hit_accession>XP_004147798</Hit_accession>
<Hit_len>883</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>52.7582</Hsp_bit-score>
<Hsp_score>125</Hsp_score>
<Hsp_evalue>7.46528e-07</Hsp_evalue>
<Hsp_query-from>61</Hsp_query-from>
<Hsp_query-to>222</Hsp_query-to>
<Hsp_hit-from>30</Hsp_hit-from>
<Hsp_hit-to>84</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_identity>32</Hsp_identity>
<Hsp_positive>35</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>56</Hsp_align-len>
<Hsp_qseq>IRVAKSRGIWESTA--NRSPNAKPRFVAISTKAKATTN*KHFSES*KYSTGGVLEP</Hsp_qseq>
<Hsp_hseq>IRVAKSRGIWGKSGMLGRQQMAKPRVLALSTKEKGPRT-KAFLRVLKYSTGGVLEP</Hsp_hseq>
<Hsp_midline>IRVAKSRGIW + R AKPR +A+STK K K F KYSTGGVLEP</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>31601460</Statistics_db-num>
<Statistics_db-len>10937649309</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
...
Something more like this, then:
$ cat iteration.awk
# iteration.awk - copy every <Iteration>...</Iteration> record whose query ID
# is listed in the file named by the bfile variable (one ID per line).
# Usage: awk -v bfile=IDS -f iteration.awk INPUT.xml
BEGIN {
    # Read the ID list first, while RS/FS still have their defaults, so each
    # line is one record and $1 is its first whitespace-delimited word.
    while ((getline < bfile) > 0)
        if (NF) D[$1] = 1          # skip blank lines: no empty-string key
    close(bfile)
    # From here on a record is the text after one "<": $1 is the tag name
    # and $2 is whatever follows the tag up to the next "<".
    RS = "<"; FS = ">"
}
# The first word of the query definition is the ID. The "in" operator tests
# membership without creating an element in D for every ID that is not wanted,
# which keeps memory flat on very large inputs.
$1 == "Iteration_query-def" { split($2, Q, " "); if (Q[1] in D) M = 1 }
$1 == "Iteration"  { P = 1 }            # a record starts: begin buffering
P                  { R = R "<" $0 }     # put back the "<" that RS consumed
# A record ends. R already carries the text that followed </Iteration> in the
# input (normally the newline), so emit it verbatim instead of appending ORS,
# which would leave a blank line between records.
$1 == "/Iteration" { if (M) printf "%s", R; M = P = R = "" }
END { if (M) print R }                  # input ended inside a wanted record
$ awk -v bfile="b" -f iteration.awk a.xml
Thanks a lot, I will try it.
Try this; it worked with the given sample:
awk '
# Copy whole <Iteration>...</Iteration> records from the second file (fileA)
# whose query ID is listed in the first file (fileB, one ID per line).
# Lines are split on ">" (FS is set on the command line), so on a line such as
#   <Iteration_query-def>ID length=64</Iteration_query-def>
# field 1 is "<Iteration_query-def" and field 2 starts with the ID.

# Print the buffered record if it was selected, then reset for the next one.
function dothis() {
    if (f) print s
    s = f = ""
}

# First file: remember each wanted ID.
FNR == NR {
    gsub(/[[:space:]]+/, "", $1)    # drop stray blanks and carriage returns
    if ($1 != "") A[$1] = 1         # a blank line must not become a key that
                                    # matches every line with an empty field 2
    next
}

# Second file: only the query-def element identifies the record.
!f && $1 ~ /<Iteration_query-def$/ {
    id = $2
    sub(/<.*/, "", id)              # remove the closing tag left attached
    sub(/^[[:space:]]+/, "", id)
    sub(/[[:space:]].*/, "", id)    # keep the ID, drop " length=NNN"
    f = (id in A)
}

# Buffer every line; a closing </Iteration> completes the record.
{
    s = (s != "") ? s ORS $0 : $0
    if (/<\/Iteration>/) dothis()
}

# Flush a final record that was not terminated by </Iteration>.
END { dothis() }
' FS=">" fileB fileA
1 Like
Thanks!! It works perfectly.