This is a workaround for handling arbitrary XML in the shell, my yanx.awk library:
# Records are the chunks between "<" characters and fields are split on ">".
# The output separators are set to the same characters as the input ones.
BEGIN {
	RS = ORS = "<"
	FS = OFS = ">"
}
# After match("qwertyuiop", /rty/)
# rbefore("qwertyuiop") is "qwe",
# rmid("qwertyuiop") is "r"
# rall("qwertyuiop") is "rty"
# rafter("qwertyuiop") is "uiop"
# rbefore(STR): the part of STR that precedes the most recent match().
# Relies on the built-in RSTART, so STR must be the string that was just
# passed to match().  Returns "" when the match starts at the first character.
# The start index is the literal 1: an unset variable would evaluate to 0,
# and several awk implementations then return one character too few.
function rbefore(STR) { return(substr(STR, 1, RSTART-1)); }
# rmid(STR): the single character of STR at position RSTART, i.e. the first
# character of the most recent match().
function rmid(STR)
{
	return substr(STR, RSTART, 1)
}
# rall(STR): the whole text matched by the most recent match(), taken from
# STR using the built-ins RSTART and RLENGTH.
function rall(STR)
{
	return substr(STR, RSTART, RLENGTH)
}
# rafter(STR): everything in STR that follows the most recent match(),
# i.e. from position RSTART+RLENGTH to the end of the string.
function rafter(STR)
{
	return substr(STR, RSTART + RLENGTH)
}
# aquote(OUT, A, PFIX): store one "name SUBSEP value" pair in array A.
#   OUT  - text of the form  name SUBSEP value ; nothing is stored if empty
#   A    - array that receives the entry
#   PFIX - optional key prefix; when set the key is PFIX ":" name
#   TA   - local scratch array for split(), not supplied by callers
# The key (prefix and name) is upper-cased; the value is stored as-is.
# Always returns "", so a caller can store and reset its buffer in one
# statement:  OUT = aquote(OUT, A, PFIX)
function aquote(OUT, A, PFIX, TA)
{
	if (!OUT)
		return ""

	if (PFIX)
		PFIX = PFIX ":"

	split(OUT, TA, SUBSEP)
	A[toupper(PFIX TA[1])] = TA[2]

	return ""
}
# qsplit(STR, A, PFIX): parse tag attributes while honouring quotes.
# Splits  a='b' c="d" e=f  into A[PFIX":"A]=b, A[PFIX":"C]=d, etc.
# (aquote() upper-cases the keys and only adds PFIX":" when PFIX is set).
#   STR     - attribute text, typically everything after the tag name
#   A       - array that receives the attributes
#   PFIX    - optional key prefix, handed through to aquote()
#   X       - unused; kept so existing five-argument calls remain valid
#   OUT     - local buffer holding the current  name SUBSEP value  pair
#   Q, RMID - locals: the quote character currently open (or "") and the
#             first character of the current match.  They are parameters
#             so that quote state cannot leak from one call to the next.
# No return value; results are delivered through A.
function qsplit(STR, A, PFIX, X, OUT,    Q, RMID) {
	# Stop at the next run of whitespace, quote character or "=".
	# \047 and \042 are the single and double quote, written as octal
	# escapes because hex escapes are not defined for POSIX awk regexes.
	# "\r" is part of the whitespace class so that it agrees with the
	# whitespace test in the branch below.
	while(STR && match(STR, /([ \r\n\t]+)|[\047\042=]/))
	{
		OUT = OUT rbefore(STR);
		RMID = rmid(STR);
		if((RMID == "'") || (RMID == "\"")) # Quote characters
		{
			if(!Q) Q = RMID;              # Begin quote section
			else if(Q == RMID) Q = "";    # End quote section
			else OUT = OUT RMID;          # Quoted quote
		} else if(RMID == "=") {
			# Only the first unquoted "=" separates name from value;
			# a quoted one, or a later one, belongs to the value.
			if(Q || index(OUT, SUBSEP)) OUT = OUT RMID;
			else OUT = OUT SUBSEP;
		} else if((RMID=="\r")||(RMID=="\n")||(RMID=="\t")||(RMID==" ")) {
			if(Q) OUT = OUT rall(STR);         # Literal quoted whitespace
			else OUT = aquote(OUT, A, PFIX);   # Unquoted WS, next block
		}
		STR = rafter(STR); # Strip off the text we've processed already.
	}
	aquote(OUT STR, A, PFIX); # Process any text we haven't already.
}
# Start every record with a clean slate: no tag name, not "special".
{
	TAG = ""
	SPEC = 0
}

# The first "line" is blank when RS=< .  It is echoed only while ORS still
# equals RS, and it never reaches any later rule.
NR == 1 {
	if (RS == ORS)
		print
	next
}

# XML specification junk: records whose text starts with "!" or "?".
/^[!?]/ {
	SPEC = 1
}
# Handle open-tags: any record whose first field starts with a tag name,
# i.e. a run of characters other than "/", whitespace and ">".
#   TAG   - upper-cased tag name
#   CTAG  - cleared, because this record is not a close-tag
#   TAGS  - "%"-terminated list of open tags, most recent first; the new
#           tag is pushed unless the record is special (SPEC) or the first
#           field ends in "/" (self-closing)
#   DEP   - incremented together with each push
#   LTAGS - copy of TAGS taken right after the push
#   ARGS  - emptied, then refilled by qsplit() with this tag's attributes
match($1, /^[^\/ \r\n\t>]+/) {
	CTAG = ""
	TAG = toupper(substr($1, RSTART, RLENGTH))

	if (!SPEC && $1 !~ /\/$/) {
		TAGS = TAG "%" TAGS
		DEP++
		LTAGS = TAGS
	}

	for (X in ARGS)
		delete ARGS[X]

	# RSTART/RLENGTH still describe the tag-name match from the pattern,
	# so rafter() yields everything after the name.
	qsplit(rafter($1), ARGS, "", "", "")
}
# Handle close-tags: non-special records that start with "/".
#   LTAGS - copy of TAGS taken before the tag is popped
#   CTAG  - upper-cased name of the tag being closed, without the leading
#           "/" and without trailing whitespace ("</td >" is a legal end-tag)
#   TAGS  - the closed tag is popped, but only when it is the most recently
#           opened one; a mismatched close-tag leaves the list untouched
#   DEP   - recomputed as the number of entries left in TAGS
#   TA    - scratch array filled by split()
# The name is compared as a plain string prefix rather than spliced into a
# regular expression, so characters such as "." in a tag name are literal.
# $1 is only read, never assigned, so $0 is not rebuilt with the current OFS.
(!SPEC) && /^[\/]/ {
	LTAGS = TAGS
	CTAG = toupper(substr($1, 2))
	sub(/[ \r\n\t]+$/, "", CTAG)

	if (index(TAGS, CTAG "%") == 1)
		TAGS = substr(TAGS, length(CTAG) + 2)

	# "A%B%" splits into "A", "B" and a trailing empty piece, hence the -1.
	DEP = split(TAGS, TA, "%") - 1
	if (DEP < 0) DEP = 0
}
And here is how you use it, in html.awk:
# Empty both output separators and initialise the array indexes:
# LINK is the next free slot in LINKS, COL the current column number.
BEGIN {
	ORS = OFS = ""
	COL = 0
	LINK = 1
}
# Print a column of data in CSV format.
# Every comma and double quote in S gets a backslash in front of it, then
# the value is written inside double quotes.  OFS is used as the "separator
# to print before the next column": its current value is written first and
# it is then set to ",", so each later column is preceded by a comma until
# something resets OFS.
function csv(S)
{
	gsub(/[,"]/, "\\\\&", S)
	printf("%s\"%s\"", OFS, S)
	OFS = ","
}
# When a table row starts, or the entire table ends, print the row that has
# been collected so far (if it has any columns).
COL && (TAG == "TR" || CTAG == "TABLE") {
	OFS = ""	# the first column of the row gets no leading comma

	# Data cells first ...
	C = 1
	while (C <= COL) {
		csv(DATA[C])
		delete DATA[C]
		C++
	}

	# ... then every collected link, for as long as consecutive indexes exist.
	C = 1
	while (C in LINKS) {
		csv(LINKS[C])
		delete LINKS[C]
		C++
	}

	printf("\n")

	# Reset indexes for arrays
	LINK = 1
	COL = 0
}
# Count columns in the table.  An <em> tag is counted as a column of its
# own as well, which separates the date comment from the preceding text.
(TAG == "EM" || TAG == "TD") { COL += 1 }
# Clean up HTML garbage in the second field ($2): every "|", every run of
# whitespace and every "�" is replaced by a single space.
# NOTE(review): "�" is U+FFFD, the Unicode replacement character.  It is
# probably a mis-encoded byte from the original script (possibly a
# non-breaking space) -- confirm against the original before relying on it.
{ gsub(/([|])|([ \r\n\t]+)|(�)/, " ", $2); }
# Collect attachments when found: any tag that carries a non-empty HREF
# attribute while TAGS contains "TABLE".
TAGS ~ /TABLE/ && ARGS["HREF"] {
	# Keep only the part after the last "/".  The slash is written as \/
	# because an unescaped "/" inside a bracket expression may end the
	# regex literal in some awk implementations.
	sub(/.*\//, "", ARGS["HREF"]);
	LINKS[LINK++]=ARGS["HREF"];
	delete ARGS["HREF"];
	next # Skip to next tags, we dont want link title
}
# Append text to the current row and col: runs whenever a TD entry appears
# anywhere in TAGS and the text after the tag ($2) is not made up solely
# of whitespace.
(TAGS ~ /(^|%)TD%/) && ($2 !~ /^[ \r\n\t]+$/) {
	DATA[COL] = DATA[COL] $2
}
And here is how you run it:
$ awk -f yanx.awk -f html.awk input.html
"AA Number. 3-456","The quick brown fox jumps over the lazy dog near the bank of the river. The quick brown fox jumps over the lazy dog near the bank of the river.","(Hello World May 20\, 2016)","May 18\, 2016","abcd.pdf","abcfull.pdf"
"BB Number. 7-890","The quick brown fox jumps over the lazy dog near the bank of the river1.The quick brown fox jumps over the lazy dog near the bank of the river2.","(Lord of the rings May 30\, 2016)","May 28\, 2016","efghi.pdf","efghifull.pdf","efghisum.pdf"
$
On Solaris, use nawk instead of awk.