Parsing XML isn't trivial, but we get asked for it all the time, so:
# yanx.awk v0.0.8, Tyler Montbriand, 2017. Yet another noncompliant XML parser
###############################################################################
# XML is a pain to process in the shell, but people need it all the time.
# I've been using and improving this kludge since 2014 or so. It parses and
# stacks tags and digests parameters, allowing simple XML processing and
# extraction to be managed with a handful of lines addendum.
#
# I've restricted my use of GNU features enough that this script will run on
# busybox's awk. I think it works with mawk except -e is unsupported.
# You can work around that by running multiple files, i.e.
# mawk -f yanx.awk -f mystuff.awk inputfile
###############################################################################
# Basic use:
#
# Fed this XML, <body><html a="b">Your Web Browser Hates This</html></body>
# yanx will read it token-by-token as so:
# Line 1: Empty, skipped
# Line 2: $1="body"
# Line 3: $1="html a="b"", $2="Your Web Browser Hates This"
# Line 4: $1="/html"
# Line 5: $1="/body", $2="\n"
#
# The script sets a few new "special" variables along the way.
# TAG The name of the current tag, uppercased.
# CTAG If close-tag, name in uppercase.
# TAGS List of nested tags, like HTML%BODY%, including current tag
# LTAGS List of nested tags, not including current tag
# ARGS Array of tag parameters, uppercased. i.e. ARGS["HREF"]
# DEP How many tags deep it's nested, including current tag.
#
###############################################################################
# Examples:
# # Rewrite cdata of all divs
# awk -f yanx.awk -e 'TAGS ~ /^DIV%/ { $2="quux froob" } 1' input
# # Extract href's from every link
# awk -f yanx.awk -e 'TAGS~/^A%/ && ("HREF" in ARGS) {
# print ARGS["HREF"] }' ORS="\n" input
###############################################################################
# Known Bugs:
# A short XML script can't possibly handle DOD, etc. Entities a la &lt;
# are not translated either.
#
# I've done my best to make it swallow <!--, <? ?> and other such fancy
# XML syntax without choking, but that doesn't mean it handles them
# properly either.
#
# It's an XML parser, not an HTML parser. It probably won't swallow a
# wild-from-the-internet HTML web page without some cleanup first:
# javascript, tags inside comments, etc will be mangled instead of ignored.
#
# Last: Because of its design, when printing raw HTML, yanx adds an extra <
# to the end of the file. This is because < belongs at the beginning of
# a token but awk is told it's printed at the end. There is no equivalent
# "line prefix" variable that I know of, if you want it to print smarter
# you'll have to print the <'s yourself, by setting ORS="" and
# printing lines like print "<" $0
###############################################################################
# Tokenize on angle brackets: a record is the text following one "<", and the
# fields inside a record are separated by ">". Output uses the same pair.
BEGIN {
    RS = "<"
    FS = ">"
    ORS = "<"
    OFS = ">"
}
# After match("qwertyuiop", /rty/)
# rbefore("qwertyuiop") is "qwe",
# rmid("qwertyuiop") is "r"
# rall("qwertyuiop") is "rty"
# rafter("qwertyuiop") is "uiop"
# !?!?!
# Text of STR that precedes the most recent match(), i.e. its first RSTART-1
# characters. awk strings are 1-indexed: asking substr() to start at 0 makes
# some implementations (gawk among them) count the nonexistent position 0
# against the length and hand back one character too few, so start at 1.
function rbefore(STR) { return(substr(STR, 1, RSTART-1)); }# before match
# First character of whatever the most recent match() matched in STR.
function rmid(STR) {
    return substr(STR, RSTART, 1)
}
# The complete text that the most recent match() matched in STR.
function rall(STR) {
    return substr(STR, RSTART, RLENGTH)
}
# Everything in STR that follows the text matched by the most recent match().
function rafter(STR) {
    return substr(STR, RSTART + RLENGTH)
}
# Store one attribute in A. OUT holds "name SUBSEP value"; the entry is written
# as A[PFIX":"NAME]=value, or A[NAME]=value when PFIX is empty, with the key
# uppercased. An empty OUT stores nothing. Always returns "" so a caller can
# reset its accumulator with the result. TA is scratch space for split().
function aquote(OUT, A, PFIX, TA) {
    if(!OUT) return("");

    split(OUT, TA, SUBSEP);
    A[toupper(PFIX ? PFIX ":" : PFIX) toupper(TA[1])] = TA[2];
    return("");
}
# Intended to be less stupid about quoted text in XML/HTML.
# Splits a='b' c='d' e='f' into A[PFIX":"A]=b, A[PFIX":"C]=d, etc. Keys are
# uppercased by aquote(); values are stored without their surrounding quotes.
#   STR      attribute text to parse
#   A        array that receives the name/value pairs
#   PFIX     optional key prefix, handed through to aquote()
#   X        unused; kept so existing five-argument calls still line up
#   OUT      accumulator for the attribute being assembled, normally ""
#   Q, RMID  locals: the currently open quote character (empty when outside
#            quotes) and the delimiter character just matched
# Q and RMID are declared as locals so that an unterminated quote in one tag
# cannot leave quote state behind for the next call. "\r" is part of the
# whitespace class so that CRLF input separates attributes the way LF does.
function qsplit(STR, A, PFIX, X, OUT,    Q, RMID) {
    while(STR && match(STR, /([ \r\n\t]+)|[\x27\x22=]/))
    {
        OUT = OUT rbefore(STR);     # Plain text up to the delimiter
        RMID=rmid(STR);
        if((RMID == "'") || (RMID == "\"")) # Quote characters
        {
            if(!Q) Q=RMID;              # Begin quote section
            else if(Q == RMID) Q="";    # End quote section
            else OUT = OUT RMID;        # Quoted quote
        } else if(RMID == "=") {
            # A quoted "=" is data; an unquoted one splits name from value.
            if(Q) OUT=OUT RMID; else OUT=OUT SUBSEP;
        } else if((RMID=="\r")||(RMID=="\n")||(RMID=="\t")||(RMID==" ")) {
            if(Q) OUT = OUT rall(STR);          # Literal quoted whitespace
            else OUT = aquote(OUT, A, PFIX);    # Unquoted WS, next block
        }
        STR=rafter(STR); # Strip off the text we've processed already.
    }
    aquote(OUT STR, A, PFIX); # Process any text we haven't already.
}
# Per-record setup. Every record begins with no current tag and is assumed to
# be an ordinary tag until its first character says otherwise.
{
    TAG = ""
    SPEC = 0

    if(NR == 1) {
        # The first "line" is blank when RS=<. It is echoed only when output
        # records end with the same string input records do, then skipped.
        if(RS == ORS) print
        next
    }

    if($0 ~ /^[!?]/) SPEC = 1   # XML specification junk
}
# Handle open-tags
# Sets TAG to the uppercased tag name and clears CTAG. Unless the tag text
# ends in "/" (self-closing), the name is pushed onto the front of TAGS, DEP is
# incremented and LTAGS is set to the new TAGS. ARGS is then rebuilt from
# whatever follows the tag name.
(!SPEC) && match($1, /^[^\/ \r\n\t>]+/) {
    CTAG=""
    TAG=substr(toupper($1), RSTART, RLENGTH);
    if(!($1 ~ /\/$/))   # SPEC is already known to be 0 inside this rule
    {
        TAGS=TAG "%" TAGS;
        DEP++;
        LTAGS=TAGS
    }
    # Empty ARGS with split() instead of a for-in/delete loop: the loop
    # variable would be a global, and would overwrite a same-named variable in
    # the user's own script (xmlsplit.awk keeps its output prefix in X).
    split("", ARGS);
    # RSTART/RLENGTH still describe the tag-name match made by the pattern.
    qsplit(rafter($1), ARGS, "", "", "");
}
# Handle close-tags
# Saves the stack as it was in LTAGS, puts the closed tag's uppercased name in
# CTAG, leaves TAG empty, removes the tag from the front of TAGS when it is the
# innermost open tag, and recomputes DEP from what remains.
(!SPEC) && /^[\/]/ {
    sub(/^\//, "", $1);     # Drop the "/" so $1 is the bare tag name
    LTAGS=TAGS
    CTAG=toupper($1)
    TAG=""
    sub("^" CTAG "%", "", TAGS);
    $1="/"$1                # Put the "/" back so the record prints unchanged
    DEP=split(TAGS, TA, "%")-1;
    # split() returns 0 for an empty stack, which would leave DEP at -1.
    if(DEP < 0) DEP=0;
}
You can use it with this:
# xmlsplit.awk
# Defaults. X and ROWS may be overridden by assignments on the command line.
BEGIN {
    ROWS = 5    # DOCUMENT records per output file
    X = "x."    # Output file name prefix
    ORS = ""    # Records are printed with their own leading "<" instead
}
# First pass, remember headers and footers
# NR==FNR holds only while the first file argument is being read. Records
# before the first DOCUMENT tag accumulate in HDR, records from the FOOTER tag
# onward accumulate in FTR, and HDREND/FTRSTART note the record numbers where
# the header stops and the footer starts. Whole records ($0) are stored, so
# text or attribute values that contain ">" are kept intact, the same way the
# body records are written out later.
NR==FNR {
    if(F || TAG == "FOOTER")
    {
        if(!F) {
            FTRSTART=FNR
            F=1
        }
        FTR=FTR "<" $0
    }
    else if((!H) && (TAG == "DOCUMENT"))
    {
        HDREND=FNR
        H=1
    }
    else if(!H) HDR=HDR "<" $0
    next
}
# Skip header and footer
# The footer test only applies when the first pass actually found a FOOTER tag
# (F set); otherwise FTRSTART is still 0 and every record would be skipped.
(FNR < HDREND) || (F && (FNR >= FTRSTART)) { next }
# Move on to a new output file whenever XNR sits on a multiple of ROWS+1:
# finish the previous file (if any) with the footer, open the next numbered
# file and start it with the header.
!(XNR % (ROWS + 1)) {
    if(FILE) {
        print FTR > FILE
        close(FILE)
    }

    FILENUM++
    FILE = sprintf("%s%04d", X, FILENUM)
    print HDR > FILE

    # Step off the multiple; XNR next lands on one after further increments.
    XNR++
}
# Copy the record to the current output file, restoring the "<" that RS ate,
# and count each closing DOCUMENT tag towards the per-file limit.
{
    print("<" $0) > FILE
    if(CTAG == "DOCUMENT") XNR++
}

# Finish the last output file with the footer.
END {
    if(FILE) print FTR > FILE
}
Like this:
# Yes, it's fed inputfile twice: the first read collects the header and footer,
# the second read writes the split files. X is the output file name prefix and
# ROWS is the number of Document records per output file.
awk -f yanx.awk -f xmlsplit.awk X="x." ROWS="10" input input
With this input:
<?xml version="1.0" encoding="UTF-8"?><Recipient>
<Header>
<tag1></tag1>
<tag2>1212233</tag2>
--
----
---
</Header>
<Document>001</Document>
<Document>002</Document>
<Document>003</Document>
<Document>004</Document>
<Document>005</Document>
<Document>006</Document>
<Document>007</Document>
<Document>008</Document>
<Document>009</Document>
<Document>010</Document>
<Document>011</Document>
<Document>012</Document>
<Document>013</Document>
<Document>014</Document>
<Document>015</Document>
<Document>016</Document>
<Document>017</Document>
<Document>018</Document>
<Document>019</Document>
<Document>020</Document>
<Document>021</Document>
<Document>022</Document>
<Document>023</Document>
<Document>024</Document>
<Document>025</Document>
<Document>026</Document>
<Document>027</Document>
<Document>028</Document>
<Document>029</Document>
<Document>030</Document>
<Document>031</Document>
<Document>032</Document>
<Document>033</Document>
<Document>034</Document>
<Document>035</Document>
<Document>036</Document>
<Document>037</Document>
<Document>038</Document>
<Document>039</Document>
<Document>040</Document>
<Document>041</Document>
<Document>042</Document>
<Document>043</Document>
<Document>044</Document>
<Document>045</Document>
<Document>046</Document>
<Document>047</Document>
<Document>048</Document>
<Document>049</Document>
<Document>050</Document>
<Document>051</Document>
<Document>052</Document>
<Document>053</Document>
<Document>054</Document>
<Document>055</Document>
<Document>056</Document>
<Document>057</Document>
<Document>058</Document>
<Document>059</Document>
<Document>060</Document>
<Document>061</Document>
<Document>062</Document>
<Document>063</Document>
<Document>064</Document>
<Document>065</Document>
<Document>066</Document>
<Document>067</Document>
<Document>068</Document>
<Document>069</Document>
<Document>070</Document>
<Document>071</Document>
<Document>072</Document>
<Document>073</Document>
<Document>074</Document>
<Document>075</Document>
<Document>076</Document>
<Document>077</Document>
<Document>078</Document>
<Document>079</Document>
<Document>080</Document>
<Document>081</Document>
<Document>082</Document>
<Document>083</Document>
<Document>084</Document>
<Document>085</Document>
<Document>086</Document>
<Document>087</Document>
<Document>088</Document>
<Document>089</Document>
<Document>090</Document>
<Document>091</Document>
<Document>092</Document>
<Document>093</Document>
<Document>094</Document>
<Document>095</Document>
<Document>096</Document>
<Document>097</Document>
<Document>098</Document>
<Document>099</Document>
<Document>100</Document>
<Footer>
---
--
</Footer>
To produce output like this:
$ cat x.0001
<?xml version="1.0" encoding="UTF-8"?><Recipient>
<Header>
<tag1></tag1>
<tag2>1212233</tag2>
--
----
---
</Header>
<Document>001</Document>
<Document>002</Document>
<Document>003</Document>
<Document>004</Document>
<Document>005</Document>
<Document>006</Document>
<Document>007</Document>
<Document>008</Document>
<Document>009</Document>
<Document>010</Document>
<Footer>
---
--
</Footer>
$ cat x.0010
<?xml version="1.0" encoding="UTF-8"?><Recipient>
<Header>
<tag1></tag1>
<tag2>1212233</tag2>
--
----
---
</Header>
<Document>091</Document>
<Document>092</Document>
<Document>093</Document>
<Document>094</Document>
<Document>095</Document>
<Document>096</Document>
<Document>097</Document>
<Document>098</Document>
<Document>099</Document>
<Document>100</Document>
<Footer>
---
--
</Footer>
$