Perl text from web

cmccabe · March 25, 2014, 2:23pm

 # Takes "Test Description" as $s, appends "/$s/" to the test URL, fetches it with LWP::Simple::get,
 # captures the text between 'meta content=' and 'name="Genes"', and writes its length, a tab and the text to output.txt.
 perl -MLWP::Simple -le '$s=shift;$c=get("http://genetics.emory.edu/egl/tests/view.php?testid=4125/$s/");$c=~/meta content=(.*?)name=\"Genes\"/msg; print length($1),"\t$1"' "Test Description" >output.txt

I am having trouble with this code: Can it be modified for the desired output? I attached the html source code and highlighted the information in green. Thank you very much.

Desired Output:

OMIM Gene
608222 ADSL
300806 AFF2
300382 ARX

Corona688 · March 25, 2014, 2:50pm

Once again the program you've posted has absolutely nothing to do with the input or output you want. We can't "fix" this, we'd have to totally rewrite it from scratch.

Not to mention that HTML isn't trivial. You generally can't extract the parts you want with just a regex match. Also like before.

cmccabe · March 25, 2014, 2:59pm

I am very unfamiliar with HTML and do not know the best way to accomplish this. What I am trying to do is:

going to a specific site (Emory Genetics Lab)
going to the Test Description tab
copy and paste into a new document the OMIM and gene information (highlighted text)

I apologize and thank you.

Corona688 · March 25, 2014, 4:02pm

Yet again, all the content is in one page and you don't have to add anything to the URL to get it.

$ cat xmls.awk
# Parser setup: records are split at "<", so every record is one tag (without
# its leading "<") followed by the text up to the next tag.
BEGIN {
        DEP=2;  # How many close tags in a row before data dump (not referenced in the rules visible here)
        POS=0   # Current depth of the open-tag stack
        RS="<";
        ORS="<";        # Puts back the "<" that RS consumed when a record is printed
        # Field separators delimit the tag name: whitespace, ">" and "/".
        # \r is included so that CRLF input does not leave a carriage return
        # glued to the tag name (same character set as the tag-name regex
        # used by the non-close tag rule).
        FS="[ \r\n\t>/]";
        SUBSEP="^"
        print ""        # Empty string plus ORS, i.e. the "<" of the first printed tag
}

# Always this finicky case when RS isn't \n
# (input that starts with "<" yields an empty first record)
(NR==1) && (length($0) == 0) { next }

# Keep the untouched record text; later rules rewrite $0.
{ LINE=$0 }

# Skip XML comments
/^!--/ {
        # A comment containing "<" is spread over several records, so keep
        # reading until the terminator shows up.  Every record pulled in is
        # appended to LINE (with the separator that RS removed) so that
        # printing LINE reproduces the whole comment, not just its first piece.
        while(!(I=index($0, "-->")))
        {
                if(getline <= 0) exit;
                LINE=LINE RS $0;
        }
        # Strip out comment: replace it with a self-closing placeholder tag
        # and keep whatever text follows the "-->".
        $0="--XMLCOMMENT-- />"substr($0,I+3);
}

# Ignore XML specification junk (records starting with "?" or "!").
# A bracket expression is used because "\!" is not a defined regex escape.
/^[?!]/ { next }

# These should be special variables for match() but aren't.
# All four read the RSTART/RLENGTH left by the most recent match() call, so
# they must be given the string that was matched, before match() runs again.

# String before match
# The start index is a literal 1: the original passed a never-assigned
# variable (i.e. 0), and POSIX leaves a start below 1 unspecified -- some
# awk implementations then return one character fewer.
function rbefore(STR)   { return(substr(STR, 1, RSTART-1)); }
# First char of match
function rmid(STR)      { return(substr(STR, RSTART, 1)); }
# Entire match
function rall(STR)      { return(substr(STR, RSTART, RLENGTH)); }
# String after match
function rafter(STR)    { return(substr(STR, RSTART+RLENGTH)); }

# Turns Q SUBSEP R into A[Q]=R
#
# OUT   one collected attribute: name, then SUBSEP, then value.  Without a
#       SUBSEP the whole text is the name and the value is empty.
# A     array that receives the attribute
# PFIX  optional key prefix; the key stored is toupper(PFIX ":" name)
# TA    unused, kept so the parameter list stays compatible
# CUT   local: position of the first SUBSEP in OUT
#
# Only the FIRST SUBSEP separates name from value, so a value that itself
# contains the SUBSEP character is stored whole instead of being cut short.
# Always returns "" so callers can reset their buffer with the result.
function aquote(OUT, A, PFIX, TA, CUT) {
        if(OUT)
        {
                if(PFIX) PFIX=PFIX":"

                CUT=index(OUT, SUBSEP);
                if(CUT)
                        A[toupper(PFIX) toupper(substr(OUT, 1, CUT-1))]=substr(OUT, CUT+length(SUBSEP));
                else
                        A[toupper(PFIX) toupper(OUT)]="";
        }

        return("");
}

# Intended to be less stupid about quoted text in XML/HTML.
# Splits a='b' c='d' e='f' into A[a]=b, A[c]=d, A[e]=f, etc.
#
# STR   attribute text of one tag
# A     array that receives the attributes (stored through aquote())
# PFIX  key prefix handed on to aquote()
# X     unused, kept so the parameter list stays compatible
# OUT   local: text collected so far for the current attribute
# Q     local: the quote character currently open, "" outside quotes
# RMID  local: first character of the current match
#
# Q and RMID are locals so that an unbalanced quote in one tag cannot leave
# the parser "inside quotes" for the next tag.
function qsplit(STR, A, PFIX, X, OUT, Q, RMID) {
        # Stop at runs of whitespace, quotes and "=".  \047 and \042 are the
        # octal escapes for ' and "; \r is part of the whitespace class so
        # CRLF-separated attributes do not keep a carriage return in a value.
        while(STR && match(STR, /[ \r\n\t]+|[\047\042=]/))
        {
                OUT = OUT rbefore(STR);

                RMID=rmid(STR);
                if((RMID == "'") || (RMID == "\""))     # Quote characters
                {
                        if(!Q)          Q=RMID;         # Begin quote section
                        else if(Q == RMID)      Q="";   # End quote section
                        else                    OUT = OUT RMID; # Quoted quote
                } else if(RMID == "=") {
                        # Literal inside quotes, name/value separator outside
                        if(Q)   OUT=OUT RMID; else OUT=OUT SUBSEP;
                } else if((RMID=="\r")||(RMID=="\n")||(RMID=="\t")||(RMID==" ")) {
                        if(Q)   OUT = OUT rall(STR); # Literal quoted whitespace
                        else    OUT = aquote(OUT, A, PFIX); # Unquoted WS, next block
                }
                STR=rafter(STR); # Strip off the text we've processed already.
        }

        aquote(OUT STR, A, PFIX); # Process any text we haven't already.
}

# Call before increment
# Registers the properties in AIN for the tag that will sit at stack position
# POS+1.  For every property name X:
#   PROPDIR[X]          number of stacked values for X
#   PROPDIR[count,X]    the value stored at that level
#   PROP[X]             the current (innermost) value
# PROPDIR[POS+1] records the SUBSEP-prefixed list of names added here, which
# is what delprop() reads to undo the registration.
# X and S are locals.
function addprop(AIN,X,S) {
#       printf("Setting PROPDIR[...,%d]\n", POS+1);
        for(X in AIN)
        {
                PROPDIR[X]=PROPDIR[X]+1;
                PROPDIR[PROPDIR[X],X]=AIN[X];
                PROP[X]=AIN[X];
                S=S SUBSEP X;
        }

        PROPDIR[POS+1]=S
}

# Call before decrement
# Undoes addprop() for the tag at stack position POS: each property name
# listed in PROPDIR[POS] loses its innermost value, and PROP falls back to
# the value of the enclosing level or is removed when none is left.
# TA, N, M and X are locals; the function is called without arguments.
function delprop(TA, N, M,X) {

        # Names added at this stack level.  The list starts with SUBSEP, so
        # element 1 of the split is empty and the names begin at element 2.
        M=split(PROPDIR[POS], TA, SUBSEP);
        delete PROPDIR[POS];

        N=1;
        while(++N <= M)
        {
                X=TA[N]

                # Drop the value stored for the level being closed
                delete PROPDIR[PROPDIR[X],X];

                PROPDIR[X]--;
                if(PROPDIR[X])  # An outer level still defines this property
                        PROP[X]=PROPDIR[PROPDIR[X],X];
                else
                        delete PROP[X];
        }
}

# Releases the properties registered by the most recent self-closing tag.
# Such a tag is never pushed on the tag stack and never gets a close tag, so
# no close-tag pop would ever remove what addprop() stored for it.  They are
# kept for the remainder of the tag's own record and dropped here, at the
# start of the next tag record.  SAVEPOS is a local.
function popselfclosed(SAVEPOS) {
        if(!SELFPOS) return;

        SAVEPOS=POS;
        POS=SELFPOS;            # delprop() works on PROPDIR[POS]
        delprop();
        POS=SAVEPOS;
        SELFPOS=0;
}

# Non-close tag
# Sets TAG (tag name), TA (attributes of this tag, plus TAG:CDATA for the text
# that follows it), registers them in PROP via addprop(), and pushes the tag
# on the TS/TSS stack unless it is self-closing.
!/^\// {
        popselfclosed();
        POP=0;

        # "/" is escaped inside the bracket expression so that awks which end
        # a regex literal at the first bare "/" still parse it.
        TAG=$1;                         sub(/^[^ \r\n\t>\/]*/, "");
        match($0, /\/?>/);
        TDATA=rbefore($0);              CDATA=rafter($0);
        # Remember how the tag ended (2 = "/>", 1 = ">") right away:
        # qsplit() below calls match() again and overwrites RLENGTH.
        ENDLEN=RLENGTH;

        # Flatten and strip whitespace
        gsub(/[ \r\n\t]+/, " ", CDATA);
        gsub(/^[ \r\n\t]+/, "", CDATA); gsub(/[ \r\n\t]+$/, "", CDATA);

        for(X in TA) delete TA[X];
        qsplit(TDATA, TA, TAG);
        if(length(CDATA))
                TA[toupper(TAG)":""CDATA"]=CDATA

        addprop(TA);

        if(ENDLEN != 2) # Found > instead of self-closing />
        {
#               printf("%s+%s\n", TSS, toupper(TAG));
                TS[++POS]=toupper(TAG);
                TSS=TSS"/"toupper(TAG);
        }
        else    SELFPOS=POS+1;  # Where addprop() filed this tag's properties
}

# Close tags
# Looks for the nearest open tag with the same name and pops everything down
# to and including it; a close tag with no matching open tag is ignored.
/^\// {
        popselfclosed();

        for(TPOS=POS; (TPOS>0) && (toupper($2) != TS[TPOS]); TPOS--);

        if(TPOS > 0)
        {
                TPOS--;
                while(TPOS < POS)
                {
                        delprop();
                        sub(/\/[^\/]*$/, "", TSS); POS--;
#                       printf("%s-%s\n", TSS, toupper($2));
                }
        }
}

# Tab selection, evaluated for every record that reaches this point.
# TAB counts the DIVs of class 'tab-content' seen so far; while the current
# DIV class is tab-content and TAB is 4, the original record text is printed.
# NOTE(review): TA is only rebuilt by the non-close tag rule, so on a close-tag
# record it still holds the previous tag's attributes -- a close tag directly
# after such a DIV would count it a second time.  Confirm whether the tab
# number 4 relies on that.
{
        if(TA["DIV:CLASS"] == "TabbedPanelsContent tab-content")
                TAB++;

        if((PROP["DIV:CLASS"] == "TabbedPanelsContent tab-content") && (TAB==4))
                print LINE;
}

$ awk -f xmls.awk 

$ awk -f xmls.awk aut.html
<div class="TabbedPanelsContent tab-content" style = 'padding:15px'>
    <!--TEST DESCRIPTION-->
    <div id = 'test-header'>Condition Description</div>
    <div id = "test-content"> <span style="font-weight: bold;">Genetics of Autism Spectrum Disorders</span><br />Autism spectrum disorders (ASDs) are a group of neurodevelopmental disorders which include autism, pervasive developmental delay-not otherwise specified (PDD-NOS), and Asperger syndrome. ASDs are characterized by impairments in social relationships, variable degrees of language and communication deficits, and repetitive behaviors and/or a narrow range of interests. The age of onset is prior to age 3 with a variable clinical presentation, ranging in severity both amongst individuals as well as amongst the various subtypes of ASDs. Additional clinical features may also be observed in individuals with an ASD, such as intellectual disability (up to ~50%) and seizures (~25%).<br /><br />Known genetic causes of autism include cytogenetically visible chromosome abnormalities (3-5%), copy number variants � which include submicroscopic deletions and duplications (~6-7%), and single gene disorders (~5%). <br /><br />Emory Genetics Laboratory�s integrated testing strategy allows for a comprehensive cytogenetics, metabolic, and molecular analysis of ASD in your patient. For a summary of autism testing at EGL, please click <a target="_blank" href="/egl/featuredtests/index.php/1643">here</a>.<br /><br />*Please note that some genes on this panel are associated with additional phenotypes.<br /><br /><span style="font-weight: bold;">All components of the Autism Panel can be ordered separately.</span><br /><br />References:<br />
<ul>
  <li>Autism and Developmental Disabilities Monitoring Network Surveillance Year 2006 Principal Investigators and the CDC (2009). <i>MMWR Surveill Summ</i>, 58:1-20.</li>
  <li>Bolton <span style="font-style: italic;">et al.</span> (2011). <i>Br J Psychiatry</i>, 198:289-294.</li>
  <li>Vorstman <span style="font-style: italic;">et al.</span> (2006) <i>Mol Psych</i>, 11:18-28.</li>
  <li>Shen <span style="font-style: italic;">et al.</span> (2010).<i> Pediatrics</i>, 125:e727-35. </li>
  <li>Miles JH (2011). <i>Genet Med</i>, 13:278-94.</li>
  <li>Schaefer <span style="font-style: italic;">et al.</span> (2008). <i>Genet Med</i>, 10:301-5. </li>
</ul></div>
     <div id = 'test-header'>Genes</div><div id = 'test-content' style = 'margin-bottom:10px;'><a href = 'http://www.omim.org/entry/608222' target = '_blank'>ADSL</a>, <a href = 'http://www.omim.org/entry/300806' target = '_blank'>AFF2</a>, <a href = 'http://www.omim.org/entry/300629' target = '_blank'>AP1S2</a>, <a href = 'http://www.omim.org/entry/300382' target = '_blank'>ARX</a>, <a href = 'http://www.omim.org/entry/300032' target = '_blank'>ATRX</a>, <a href = 'http://www.omim.org/entry/614901' target = '_blank'>BCKDK</a>, <a href = 'http://www.omim.org/entry/164757' target = '_blank'>BRAF</a>, <a href = 'http://www.omim.org/entry/114205' target = '_blank'>CACNA1C</a>, <a href = 'http://www.omim.org/entry/300172' target = '_blank'>CASK</a>, <a href = 'http://www.omim.org/entry/300203' target = '_blank'>CDKL5</a>, <a href = 'http://www.omim.org/entry/608892' target = '_blank'>CHD7</a>, <a href = 'http://www.omim.org/entry/604569' target = '_blank'>CNTNAP2</a>, <a href = 'http://www.omim.org/entry/600140' target = '_blank'>CREBBP</a>, <a href = 'http://www.omim.org/entry/602858' target = '_blank'>DHCR7</a>, <a href = 'http://www.omim.org/entry/300377' target = '_blank'>DMD</a>, <a href = 'http://www.omim.org/entry/607001' target = '_blank'>EHMT1</a>, <a href = 'http://www.omim.org/entry/300546' target = '_blank'>FGD1</a>, <a href = 'http://www.omim.org/entry/309550' target = '_blank'>FMR1</a>, <a href = 'http://www.omim.org/entry/136430' target = '_blank'>FOLR1</a>, <a href = 'http://www.omim.org/entry/164874' target = '_blank'>FOXG1</a>, <a href = 'http://www.omim.org/entry/605515' target = '_blank'>FOXP1</a>, <a href = 'http://www.omim.org/entry/605317' target = '_blank'>FOXP2</a>, <a href = 'http://www.omim.org/entry/ ' target = '_blank'>GABRB3</a>, <a href = 'http://www.omim.org/entry/308000' target = '_blank'>HPRT1</a>, <a href = 'http://www.omim.org/entry/314690' target = '_blank'>KDM5C</a>, <a href = 'http://www.omim.org/entry/308840' target = 
'_blank'>L1CAM</a>, <a href = 'http://www.omim.org/entry/611472' target = '_blank'>MBD5</a>, <a href = 'http://www.omim.org/entry/300005' target = '_blank'>MECP2</a>, <a href = 'http://www.omim.org/entry/300188' target = '_blank'>MED12</a>, <a href = 'http://www.omim.org/entry/600662' target = '_blank'>MEF2C</a>, <a href = 'http://www.omim.org/entry/300552' target = '_blank'>MID1</a>, <a href = 'http://www.omim.org/entry/300457' target = '_blank'>NHS</a>, <a href = 'http://www.omim.org/entry/608667' target = '_blank'>NIPBL</a>, <a href = 'http://www.omim.org/entry/300336' target = '_blank'>NLGN3</a>, <a href = 'http://www.omim.org/entry/300427' target = '_blank'>NLGN4X</a>, <a href = 'http://www.omim.org/entry/603881' target = '_blank'>NR1I3</a>, <a href = 'http://www.omim.org/entry/600565' target = '_blank'>NRXN1</a>, <a href = 'http://www.omim.org/entry/606681' target = '_blank'>NSD1</a>, <a href = 'http://www.omim.org/entry/300127' target = '_blank'>OPHN1</a>, <a href = 'http://www.omim.org/entry/601545' target = '_blank'>PAFAH1B1</a>, <a href = 'http://www.omim.org/entry/300460' target = '_blank'>PCDH19</a>, <a href = 'http://www.omim.org/entry/300414' target = '_blank'>PHF6</a>, <a href = 'http://www.omim.org/entry/605610' target = '_blank'>PNKP</a>, <a href = 'http://www.omim.org/entry/300463' target = '_blank'>PQBP1</a>, <a href = 'http://www.omim.org/entry/300828' target = '_blank'>PTCHD1</a>, <a href = 'http://www.omim.org/entry/ ' target = '_blank'>PTEN</a>, <a href = 'http://www.omim.org/entry/176876' target = '_blank'>PTPN11</a>, <a href = 'http://www.omim.org/entry/300774' target = '_blank'>RAB39B</a>, <a href = 'http://www.omim.org/entry/607642' target = '_blank'>RAI1</a>, <a href = 'http://www.omim.org/entry/600514' target = '_blank'>RELN</a>, <a href = 'http://www.omim.org/entry/182389' target = '_blank'>SCN1A</a>, <a href = 'http://www.omim.org/entry/138140' target = '_blank'>SLC2A1</a>, <a href = 'http://www.omim.org/entry/300231' target = 
'_blank'>SLC9A6</a>, <a href = 'http://www.omim.org/entry/601607' target = '_blank'>SMARCB1</a>, <a href = 'http://www.omim.org/entry/300040' target = '_blank'>SMC1A</a>, <a href = 'http://www.omim.org/entry/602272' target = '_blank'>TCF4</a>, <a href = 'http://www.omim.org/entry/312180' target = '_blank'>UBE2A</a>, <a href = 'http://www.omim.org/entry/601623' target = '_blank'>UBE3A</a>, <a href = 'http://www.omim.org/entry/607817' target = '_blank'>VPS13B</a>, <a href = 'http://www.omim.org/entry/605802' target = '_blank'>ZEB2</a></div><div id = 'test-header'>Methodology</div><div id = 'test-content'><span style="font-weight: bold;">Next Generation Sequencing:</span>�� In solution hybridization of all coding exons contained in the genes of the Autism Panel is performed on the patient's genomic DNA. Direct sequencing of the amplified captured regions is performed using next generation sequencing. The patient's gene sequences are then compared to a standard reference sequence. Potentially causative variants and areas of low coverage are Sanger sequenced in order to confirm variants and ensure 100% coverage of the targeted exons. Sequence variations are classified as pathogentic variants, benign variants unrelated to disease, or variants of unknown clinical significance. Variants of unknown clinical significance may require further studies of the patient and/or family members. This assay does not interrogate the promoter region, deep intronic regions, or other regulatory elements, and does not detect single or multi exon deletions or duplications.<br /><br /><br /><span style="font-weight: bold;">Deletion/Duplication Analysis:�</span> DNA isolated from peripheral blood is hybridized to a gene-targeted CGH array to detect deletions and duplications. 
The targeted CGH array has overlapping probes that cover the entire genomic region.<br /><br /><span style="font-weight: bold;">FRAXE:</span>� Alleles in the normal and premutation range are detected by PCR amplification and fragment analysis.� Males: Methylation sensitive PCR is performed to determine <i>AFF2</i>/ <i>FMR2</i> methylation status. Females: Southern blot analysis is performed to detect expanded alleles of the <i>FMR2</i> gene.</div><div id = 'test-header'>Detection</div><div id = 'test-content'><span style="font-weight: bold;">Next Generation Sequencing:</span>� Clinical Sensitivity: Unknown. Mutations in the promoter region, some mutations in the introns and other regulatory element mutations cannot be detected by this analysis. Large deletions/duplications will not be detected by this analysis. Results of molecular analysis should be interpreted in the context of the patient's clinical/biochemical phenotype. <br />Analytical Sensitivity: ~99% <br /><br /><span style="font-weight: bold;">Deletion/Duplication Analysis:</span>� Detection is limited to duplications and deletions. The CGH array will not detect point or intronic mutations. Results of molecular analysis must be interpreted in the context of the patient's clinical and/or biochemical phenotype.<br /><br /><span style="font-weight: bold;">FRAXE:</span>� All cases of FRAXE syndrome caused by CCG expansion of the <i>AFF2</i>/<i>FMR2</i> gene will be detected by this assay. Rare cases of FRAXE syndrome caused by mutation of the AFF2/FMR2 gene will not be detected by this assay.</div>    <!---->

$