Yes. My 2nd question was whether the output is to be sampled with or without replacement, and Xterra has responded that each output set is to use sampling without replacement.
My 1st question was whether the random sequences 1, 3, 5 and 1, 5, 3 (and the four other orders of those three values) are to be treated as 6 distinct output sequences or whether they should all be normalized to the single sequence where the output lines are in the same order as they were in the input files. That question hasn't been answered yet, so I'm assuming they are to be treated as distinct sequences. (I expect to have an awk program with a shell wrapper to set options later today unless someone else comes up with a working solution first.)
---------- Post updated at 05:45 ---------- Previous update was at 01:24 ----------
I believe this script does what was requested with considerable (although not complete) error checking. I use the Korn shell (and tested this script using it), but this should work with any POSIX conforming shell. If you are using a Solaris/SunOS system, use /usr/xpg4/bin/awk or nawk instead of awk:
#!/bin/ksh
# SYNOPSIS
# rso [-l line_count] [-s sequence_count] fileA fileB
# DESCRIPTION
# The rso utility "r"andomly "s"elects "sequence_count" (default 10)
# sequences of "line_count" (default 3) corresponding lines from "fileA"
# and "fileB" and "o"utputs those lines to files with names:
# Outfile1-X and Outfile2-X
# X is a sequence number ranging from 1 through "sequence_count".
# Leading zeros will be added to the sequence number, if needed, to make
# all output filenames be the same length. Lines in Outfile1-* will be
# from "fileA" and the corresponding lines in Outfile2-* will be the
# corresponding lines from "fileB".
#
# Even though the output lines selected will be randomly selected, no
# input line will be output more than once in a given sequence.
#
# The number of lines in "fileA" and "fileB" must be the same and the
# number of lines in the files must be greater than or equal to
# "line_count".
# Set defaults; the option parsing that follows may override lc and sc
ec=0 # Error flag (0 -> no error found so far)
lc=3 # Default line_count (lines written to each output file)
sc=10 # Default sequence_count (number of output file pairs)
# Save script name for diagnostics.  The parameter expansion strips any
# leading directory part without word-splitting or globbing "$0", so it is
# safe even when the script pathname contains whitespace.
sn=${0##*/}
# Process command line options: -l sets line_count, -s sets sequence_count.
# An unknown option or a missing option-argument only raises the error flag;
# the usage message is printed later.
while getopts l:s: opt; do
  case "$opt" in
    l) lc=$OPTARG ;;
    s) sc=$OPTARG ;;
    ?) ec=1 ;;
  esac
done
# Drop the options that were parsed so only the operands remain in "$@"
shift "$((OPTIND - 1))"
# Verify # of operands: exactly two input files (fileA and fileB) must remain
# after option processing.  The diagnostic is written to standard error so
# it is never mixed into standard output.
if [ "$#" -ne 2 ]
then printf "%s: 2 operands are required; %d found\n" "$sn" "$#" >&2
  ec=1
fi
# If we found errors or the awk script detects an error, print a usage message.
# awk is only started when no option/operand error was flagged earlier.  The
# awk program validates line_count and sequence_count, loads both input files,
# and writes the Outfile1-* / Outfile2-* pairs; everything it prints on
# standard output (its diagnostics) is redirected to standard error.
if [ "$ec" -ne 0 ] || ! awk -v lc="$lc" -v sc="$sc" -v sn="$sn" '
# Verify that "line_count" and "sequence_count" are positive integer values
BEGIN {
  if(lc !~ /^[[:digit:]]+$/ || lc < 1) {
    printf("%s: line_count (%s) must be a positive integer\n",
      sn, lc)
    ec = 1
    exit ec
  }
  if(sc !~ /^[[:digit:]]+$/ || sc < 1) {
    printf("%s: sequence_count (%s) must be a positive integer\n",
      sn, sc)
    ec = 2
    exit ec
  }
  # Seed the random number generator.  Without this call rand() starts
  # from the same seed on every run, so every invocation would select
  # exactly the same lines.  srand() with no argument seeds from the
  # time of day, so runs started within the same second still match.
  srand()
}
# Save input file names for diagnostics
FNR == 1 {
  fn[++fc] = FILENAME
}
# Accumulate and count input lines from both input files
# (FNR == NR only holds while the first file is being read)
{
  if(FNR == NR) f1[++c1] = $0
  else f2[++c2] = $0
}
END {
  # If we got here due to an earlier detected error, get out now
  if(ec) exit ec
  # Verify that both files contain the same number of lines and that
  # line_count <= # of lines in the files
  if(c1 != c2) {
    printf("%s: lines in %s (%d) must equal lines in %s (%d).\n",
      sn, fn[1], c1, fn[2], c2)
    exit 3
  }
  if(c1 < lc) {
    printf("%s: line_count(%d) must be <= # of lines in files(%d)\n",
      sn, lc, c1)
    exit 4
  }
  # Produce output sequences
  for(i = 1; i <= sc; i++) {
    # Set output file names; the sequence number is zero-padded to the
    # width of sequence_count so all names have the same length
    of1 = sprintf("Outfile1-%0*d", length(sc), i)
    of2 = sprintf("Outfile2-%0*d", length(sc), i)
    # Set random list of line_count line numbers to output for this
    # output sequence
    for(j = 1; j <= lc; j++) {
      # Find line_count distinct line numbers: redraw until the
      # number drawn is not already in the list.  This terminates
      # because line_count <= # of lines was verified above.
      while((k = int(rand() * c1) + 1) in list) continue
      list[k]
    }
    # Print corresponding pairs of lines from the input files into
    # the output files and delete the printed line number from the
    # list of selected random numbers.  NOTE: the order in which
    # for-in visits the list is unspecified, so the order of the
    # lines within an output file is implementation dependent.
    for(j in list) {
      print f1[j] > of1
      print f2[j] > of2
      delete list[j]
    }
    # Close the output files created for this sequence
    close(of1)
    close(of2)
  }
}' "$1" "$2" >&2
then printf "Usage: %s [-l line_count] [-s sequence_count] fileA fileB\n" \
  "$sn" >&2
  exit 1
fi