Convert XML to Data File in Shell Script

Hi All,

I will be getting a huge XML file with a lot of records in it. I need to convert it into multiple data files.

SAMPLE XML FILE


<ABSProductCatalog xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
- <ProductSalesHierachy>
- <Portfolios>
- <Portfolio productCode="P1">
  <Attribute name="CatalogProductName" value="Access" /> 
  <Attribute name="Status" value="Active" /> 
  </Portfolio>
- <Portfolio productCode="P2">
  <Attribute name="CatalogProductName" value="Data" /> 
  <Attribute name="Status" value="Active" /> 
  </Portfolio>
- <Portfolio productCode="P3">
  <Attribute name="CatalogProductName" value="Voice" /> 
  <Attribute name="Status" value="Active" /> 
  </Portfolio>
- <Portfolio productCode="P4">
  <Attribute name="CatalogProductName" value="Wireless" /> 
  <Attribute name="Status" value="Active" /> 
  </Portfolio>
  </Portfolios>
- <Families>
- <Family productCode="F1">
  <Attribute name="CatalogProductName" value="Internet Access Services" /> 
  <Attribute name="Status" value="Active" /> 
- <ParentHierarchy>
  <Item productCode="P1" modelType="Portfolio" /> 
  </ParentHierarchy>
  </Family>
- <Family productCode="F2">
  <Attribute name="CatalogProductName" value="Local Access Services" /> 
  <Attribute name="Status" value="Active" /> 
- <ParentHierarchy>
  <Item productCode="P2" modelType="Portfolio" /> 
  </ParentHierarchy>
  </Family>
  </Families>
- <SubFamilies>
- <SubFamily productCode="SF1">
  <Attribute name="CatalogProductName" value="Business Internet service" /> 
  <Attribute name="Status" value="Active" /> 
- <ParentHierarchy>
  <Item productCode="F1" modelType="Family" /> 
  </ParentHierarchy>
  </SubFamily>
  </SubFamilies>
- <ProductRefs>
- <ProductRef productCode="WSP1" modelType="Wireline Sales Product">
  <ActiveFlag>Y</ActiveFlag> 
- <ProductHierarchy>
  <SalesHierarchy family="F1" subFamily="SF1" portfolio="P1" primary="Y" /> 
  <SalesHierarchy family="F2" portfolio="P2" primary="N" /> 
  <FinancialHierarchy quotaBucket="Voice" strategicProdCategory="Local Voice" /> 
  </ProductHierarchy>
  </ProductRef>
- <ProductRef productCode="MSP2" modelType="Handset">
  <ActiveFlag>Y</ActiveFlag> 
- <ProductHierarchy>
  <SalesHierarchy portfolio="P4" primary="Y" /> 
  </ProductHierarchy>
  </ProductRef>
  </ProductRefs>
  </ProductSalesHierachy>
- <Offers>
- <Offer productCode="ABN">
  <OfferName>ABN</OfferName> 
  <OfferDescription>ABN Description</OfferDescription> 
- <Segments>
  <Segment>SCG</Segment> 
  <Segment>PCG</Segment> 
  </Segments>
  <OfferUpdateDate>2009-11-20</OfferUpdateDate> 
  <ActiveFlag>Y</ActiveFlag> 
  </Offer>
- <Offer productCode="OneNet">
  <OfferName>OneNet</OfferName> 
  <OfferDescription>OneNet Description</OfferDescription> 
- <Segments>
  <Segment>SCG</Segment> 
  <Segment>PCG</Segment> 
  <Segment>PCG2</Segment> 
  </Segments>
  <OfferUpdateDate>2009-11-20</OfferUpdateDate> 
  <ActiveFlag>Y</ActiveFlag> 
  </Offer>
  </Offers>
- <Products>
- <Product productCode="WSP1" modelType="Wireline Sales Product">
  <ProductName>AT&T High Speed Internet</ProductName> 
  <ProductDescription>High Speed Internet</ProductDescription> 
  <LegacyCoProdIndicator>SBC</LegacyCoProdIndicator> 
  <RevenueCBLCode>1234B</RevenueCBLCode> 
  <VolumeCBLCode>4567A</VolumeCBLCode> 
  <SAARTServiceIDCode>S1234</SAARTServiceIDCode> 
  <MarginPercentRequired>Y</MarginPercentRequired> 
  <PercentIntl>%234</PercentIntl> 
  <UOM>Each</UOM> 
  <PriceType>OneTime</PriceType> 
  <ProductStatus>Active</ProductStatus> 
  <Compensable>Y</Compensable> 
  <Jurisdiction>Everywhere</Jurisdiction> 
  <ActiveFlag>Y</ActiveFlag> 
- <Availabilities>
  <Availability>SE</Availability> 
  <Availability>E</Availability> 
  </Availabilities>
- <Segments>
  <Segment>SCG</Segment> 
  <Segment>PCG</Segment> 
  </Segments>
  <VDIndicator>Voice</VDIndicator> 
  <PSOCCode>PSOC 1</PSOCCode> 
  <USBilled>Y</USBilled> 
  <MOWBilled>N</MOWBilled> 
  <ProductStartDate>2009-11-20</ProductStartDate> 
  <ProductUpdateDate>2009-11-20</ProductUpdateDate> 
  <ProductEndDate>2010-11-20</ProductEndDate> 
- <AliasNames>
  <AliasName>AT&T HSI</AliasName> 
  <AliasName>AT&T Fast Internet</AliasName> 
  </AliasNames>
- <OfferTypes>
  <OfferType productCode="ABN" endDate="2009-11-20" /> 
  <OfferType productCode="OneNet" /> 
  </OfferTypes>
- <DynamicAttributes>
- <DynamicAttribute dataType="String" defaultValue="2.5 Mbps" name="Speed">
  <AttrValue>1.5 Mbps</AttrValue> 
  <AttrValue>2.5 Mbps</AttrValue> 
  <AttrValue>3.5 Mbps</AttrValue> 
  </DynamicAttribute>
- <DynamicAttribute dataType="String" name="TransportType">
  <AttrValue>T1</AttrValue> 
  </DynamicAttribute>
  </DynamicAttributes>
  </Product>
- <Product productCode="MSP2" modelType="Handset">
  <ProductName>Blackberry Bold</ProductName> 
  <ProductDescription>Blackberry Bold Phone</ProductDescription> 
  <LegacyCoProdIndicator /> 
  <RevenueCBLCode /> 
  <VolumeCBLCode /> 
  <SAARTServiceIDCode /> 
  <MarginPercentRequired /> 
  <PercentIntl /> 
  <UOM>Each</UOM> 
  <PriceType /> 
  <ProductStatus>Active</ProductStatus> 
  <Compensable /> 
  <Jurisdiction /> 
  <ActiveFlag>Y</ActiveFlag> 
- <Availabilities>
  <Availability /> 
  </Availabilities>
- <Segments>
  <Segment>SCG</Segment> 
  <Segment>PCG</Segment> 
  </Segments>
  <VDIndicator>Voice</VDIndicator> 
  <PSOCCode /> 
  <USBilled /> 
  <MOWBilled /> 
  <ProductStartDate>2009-11-20</ProductStartDate> 
  <ProductUpdateDate>2009-11-20</ProductUpdateDate> 
- <AliasNames>
  <AliasName /> 
  </AliasNames>
- <OfferTypes>
  <OfferType productCode="ABN" /> 
  </OfferTypes>
- <DynamicAttributes>
- <DynamicAttribute dataType="String" name="StlmntContractType">
  <AttrValue /> 
  </DynamicAttribute>
- <DynamicAttribute dataType="String" name="BMG 2 year price">
  <AttrValue>20</AttrValue> 
  </DynamicAttribute>
- <DynamicAttribute dataType="String" name="MSRP">
  <AttrValue>40</AttrValue> 
  </DynamicAttribute>
- <DynamicAttribute dataType="String" name="BMGAvailableType">
  <AttrValue /> 
  </DynamicAttribute>
- <DynamicAttribute dataType="String" name="ProductId">
  <AttrValue>123456</AttrValue> 
  </DynamicAttribute>
- <DynamicAttribute dataType="String" name="modelSource">
  <AttrValue>product</AttrValue> 
  </DynamicAttribute>
  </DynamicAttributes>
  </Product>
  </Products>
  <CatalogChanged>Y</CatalogChanged> 
  </ABSProductCatalog>

I need the above XML file broken down into multiple data files. For example, the Portfolio tag elements should go into Portfolio.txt. The text file should look like this:

P1|Access|Active

Similarly Family Tag elements should go into Family.txt, same with Sub Family and Products as well.

Can you suggest a way to achieve this? I don't have much experience with shell scripting, so please excuse me if my question is naive.

Thanks,
Raghav

Shell scripting and XML processing are not a happy combination. XML is not line-based, for one thing; quite the opposite. It's not really 'raw text' either; it's encoded in a way with a million and one permutations that must all be abided by to guarantee proper behavior. There are huge C libraries just for properly handling XML. It would be possible to hack a shell script to handle just this one arrangement of it, but if they suddenly change where the line breaks are, it will stop working...

Hi Corona,

I totally understand that. The arrangement is that they will not change the XML for now. I just need a way to do the conversion of the XML. Please let me know if it is possible.

---------- Post updated at 11:56 AM ---------- Previous update was at 11:54 AM ----------

I typed the following command

type xml2-config

I got a reply as below

xml2-config is /usr/bin/xml2-config

Does this mean I have XSLT installed and I can use it for the transformation?

Thanks

I'm trying. Is the data file really as you have it shown -- with those crazy extra dashes in front of nearly every line?

[edit] no, xml2-config doesn't mean you have xslt.

Hi Corona, those dashes are not necessary. I was just showing those dashes as delimiters; we need not have them. Thank you.

A scripting language might make your task easier to accomplish.

Here's an idea that uses regular expressions in Perl -

$
$ ## check if the text files exist
$
$ ls portfolio.txt family.txt subfamily.txt
ls: cannot access portfolio.txt: No such file or directory
ls: cannot access family.txt: No such file or directory
ls: cannot access subfamily.txt: No such file or directory
$
$ ## check the contents of the xml file
$
$ cat -n test.xml
     1  <ABSProductCatalog xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
     2  <ProductSalesHierachy>
     3  <Portfolios>
     4  <Portfolio productCode="P1">
     5  <Attribute name="CatalogProductName" value="Access" />
     6  <Attribute name="Status" value="Active" />
     7  </Portfolio>
     8  <Portfolio productCode="P2">
     9  <Attribute name="CatalogProductName" value="Data" />
    10  <Attribute name="Status" value="Active" />
...
... <snipped a lot of XML content here>
...
   183  <DynamicAttribute dataType="String" name="ProductId">
   184  <AttrValue>123456</AttrValue>
   185  </DynamicAttribute>
   186  <DynamicAttribute dataType="String" name="modelSource">
   187  <AttrValue>product</AttrValue>
   188  </DynamicAttribute>
   189  </DynamicAttributes>
   190  </Product>
   191  </Products>
   192  <CatalogChanged>Y</CatalogChanged>
   193  </ABSProductCatalog>
$
$
$ ## run the Perl script
$
$ perl -lne 'BEGIN {undef $/}   # undef $/ = slurp mode: the whole file is read as one record; -l makes print append the newline
>            while (/<(Portfolio|Family|SubFamily) productCode="(.*?)".*?value="(.*?)".*?value="(.*?)".*?<\/(Portfolio|Family|SubFamily)>/sg) {   # $1 = tag name, $2 = productCode, $3 and $4 = the first two value attributes
>              if ($1 eq "Portfolio") {push @p, "$2|$3|$4"}   # collect pipe-delimited records per tag type
>              elsif ($1 eq "Family") {push @f, "$2|$3|$4"}
>              elsif ($1 eq "SubFamily") {push @sf, "$2|$3|$4"}
>            }
>           END {if (@p)  {open(F, ">portfolio.txt"); foreach(@p) {print F $_} close(F)}   # each non-empty list is written to its own file, one record per line
>                if (@f)  {open(F, ">family.txt"); foreach(@f) {print F $_} close(F)}
>                if (@sf) {open(F, ">subfamily.txt"); foreach(@sf) {print F $_} close(F)}
>           }
>           ' test.xml
$
$
$ # check if the text files have been generated
$
$ ls -1 portfolio.txt family.txt subfamily.txt
family.txt
portfolio.txt
subfamily.txt
$
$ # they are there now... check their contents
$
$ cat portfolio.txt
P1|Access|Active
P2|Data|Active
P3|Voice|Active
P4|Wireless|Active
$
$ cat family.txt
F1|Internet Access Services|Active
F2|Local Access Services|Active
$
$ cat subfamily.txt
SF1|Business Internet service|Active
$
$

For more power and flexibility, there are Perl modules for XML available at CPAN.

HTH,
tyler_durden

Hi Tyler, thanks a lot for your help. It's working fine and creating the text files.

I have a question though: I am trying to create ProductRef.txt and Products.txt just like you did for the other three, but it's not working as expected. Is it because there are some extra tags within those tags?

How do I create files for them as well?

Thanks
Raghav

---------- Post updated at 02:41 PM ---------- Previous update was at 02:37 PM ----------

Tyler,

This is how I modified your code


perl -lne 'BEGIN {undef $/}
            while (/<(Portfolio|Family|SubFamily) productCode="(.*?)".*?value="(.*?)".*?

value="(.*?)".*?<\/(Portfolio|Family|SubFamily)>/sg) {
              if ($1 eq "Portfolio") {push @p, "$2|$3|$4"}
              elsif ($1 eq "Family") {push @f, "$2|$3|$4"}
              elsif ($1 eq "SubFamily") {push @sf, "$2|$3|$4"}
              elsif ($1 eq "ProductRefs") {push @pr, "$2|$3|$4|$4|$5|$6|$7|$8|$9|$10"}   # never true: the alternation can only put Portfolio, Family or SubFamily into $1; the pattern has five groups, so $6..$10 are never set
              
            }
           END {if (@p)  {open(F, ">portfolio.txt"); foreach(@p) {print F $_} close(F)}
                if (@f)  {open(F, ">family.txt"); foreach(@f) {print F $_} close(F)}
                if (@sf) {open(F, ">subfamily.txt"); foreach(@sf) {print F $_} close(F)}
                if (@pr) {open(F, ">ProductRefs.txt"); foreach(@pr) {print F $_} close(F)}   # @pr is never filled by the loop, so this file is never created
           }
           ' CPC.xml

---------- Post updated at 03:57 PM ---------- Previous update was at 02:41 PM ----------

Hi... I have XSLT installed on my Unix box. Can you guys suggest how to create an XSL file for the XML I posted? I think using the XSL approach will be a much easier way to convert the XML to a data file.

It's not working because the regular expression is incorrect for "ProductRefs". It's correct for "Portfolio", "Family" and "SubFamily" though.

perl -lne 'BEGIN {undef $/}
            while (/<(Portfolio|Family|SubFamily) productCode="(.*?)".*?value="(.*?)".*?

value="(.*?)".*?<\/(Portfolio|Family|SubFamily)>/sg) {
              if ($1 eq "Portfolio") {push @p, "$2|$3|$4"}
              elsif ($1 eq "Family") {push @f, "$2|$3|$4"}
              elsif ($1 eq "SubFamily") {push @sf, "$2|$3|$4"}
              elsif ($1 eq "ProductRefs") {push @pr, "$2|$3|$4|$4|$5|$6|$7|$8|$9|$10"}   # never true: the alternation can only put Portfolio, Family or SubFamily into $1; the pattern has five groups, so $6..$10 are never set
              
            }
           END {if (@p)  {open(F, ">portfolio.txt"); foreach(@p) {print F $_} close(F)}
                if (@f)  {open(F, ">family.txt"); foreach(@f) {print F $_} close(F)}
                if (@sf) {open(F, ">subfamily.txt"); foreach(@sf) {print F $_} close(F)}
                if (@pr) {open(F, ">ProductRefs.txt"); foreach(@pr) {print F $_} close(F)}   # @pr is never filled by the loop, so this file is never created
           }
           ' CPC.xml

What are the values of the text in red font ? - $1, $5, $6, ..., $10 ?
If you are unable to answer this question then I'd assume that you are not familiar with regular expressions, and in that case, I'd recommend you to get your concepts clear by studying and practising them.

Alternatively, you may want to check out Perl modules related to XML on CPAN.

tyler_durden

Here is a partial XSLT1.* stylesheet which outputs the Portfolio and Family attribute values into separate text files. You can easily extend it to handle the remaining attribute values that you want to extract.

<?xml version="1.0"?>
<!--
  Splits the product catalog into pipe-delimited text files:

    portfolio.txt : one line per Portfolio element
                    productCode|value of 1st Attribute|value of 2nd Attribute
    family.txt    : one line per Family element
                    productCode|value of 1st Attribute|value of 2nd Attribute|modelType of the parent Item

  NOTE: xsl:document is not an XSLT 1.0 instruction; it comes from the
  XSLT 1.1 working draft, so the processor must implement it. With XSLT 2.0
  use xsl:result-document instead.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <!-- Entry point: matched once, on the document root, so each output file
       is opened exactly once no matter how the input is shaped. -->
  <xsl:template match="/">
    <xsl:document href="portfolio.txt" method="text">
      <xsl:apply-templates select="//Portfolio"/>
    </xsl:document>
    <xsl:document href="family.txt" method="text">
      <xsl:apply-templates select="//Family"/>
    </xsl:document>
  </xsl:template>

  <!-- One record per Portfolio. The Attribute children are addressed by
       position, so the input order (name first, status second) matters. -->
  <xsl:template match="Portfolio">
    <xsl:value-of select="@productCode"/>
    <xsl:text>|</xsl:text>
    <xsl:value-of select="./Attribute[1]/@value"/>
    <xsl:text>|</xsl:text>
    <xsl:value-of select="./Attribute[2]/@value"/>
    <!-- Record terminator: the line-feed character reference, which must be
         written without any embedded spaces. -->
    <xsl:text>&#10;</xsl:text>
  </xsl:template>

  <!-- One record per Family: same three fields as Portfolio, plus the
       modelType of the Item under ParentHierarchy. -->
  <xsl:template match="Family">
    <xsl:value-of select="@productCode"/>
    <xsl:text>|</xsl:text>
    <xsl:value-of select="./Attribute[1]/@value"/>
    <xsl:text>|</xsl:text>
    <xsl:value-of select="./Attribute[2]/@value"/>
    <xsl:text>|</xsl:text>
    <xsl:value-of select="./ParentHierarchy/Item/@modelType"/>
    <!-- Record terminator (line feed). -->
    <xsl:text>&#10;</xsl:text>
  </xsl:template>

</xsl:stylesheet>

If you are using XSLT 2.0, I suggest you use the <xsl:result-document> element instead of the <xsl:document> element to construct the multiple output documents.

$ cat *.txt
F1|Internet Access Services|Active|Portfolio
F2|Local Access Services|Active|Portfolio
P1|Access|Active
P2|Data|Active
P3|Voice|Active
P4|Wireless|Active
$

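Note - The newline character reference in the stylesheet must be written without spaces (ampersand, hash sign, 10, semicolon, with nothing in between). If it appears as "& # 10 ;" in the listing above, remove the spaces; they had to be put in when posting because the forum code tags eat up certain XSLT constructs.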