FileA
chr1 31237964 NP_001018494.1 PUM1 M340L
chr1 31237964 NP_055491.1 PUM1 M340L
chr1 33251518 NP_037543.1 AK2 H191D
chr1 33251518 NP_001616.1 AK2 H191D
chr1 57027345 NP_001004303.2 C1orf168 P270S
FileB
chr1 116944164 NP_001533.2 IGSF3 R671W
chr1 33251518 NP_001616.1 AK2 H191D
chr1 57027345 NP_001004303.2 C1orf168 P270S
chr1 89606840 NP_940862.2 GBP6 R48C
chr1 110751878 NP_006393.2 HBXIP P45L
chr1 246803952 NP_001001821.1 OR2T34 A244T
FileC
chr1 17164810 NP_055490.3 CROCC G1471R
chr1 36323375 NP_055281.2 TEKT2 R61G
chr1 89606840 NP_940862.2 GBP6 R48C
chr1 40302534 NP_006358.1 CAP1 V115L
chr1 33251518 NP_001616.1 AK2 H191D
chr1 62026171 NP_795352.2 INADL P336H
FileD
chr1 116944223 NP_001533.2 IGSF3 S651I
chr1 116944223 NP_001007238.1 IGSF3 S631I
chr1 150394079 XP_001724459.1 RPTN E707G
chr1 36323375 NP_055281.2 TEKT2 R61G
chr1 150547095 NP_002007.1 FLG E2297D
chr1 172075300 NP_060592.2 DARS2 G338E
chr1 222620225 NP_054903.1 CNIH4 G54S
I want a script (awk preferably or python) that will look for common lines in the 4 different files. Files are sorted on Col1, but can be resorted if necessary.
I want to have three output files
1) Common lines in all 4 files
2) Common lines in any 3 files
3) Common lines in any 2 files. Getting which files have the common-line would be nice too.
Kindly help
~GH
Possibly a point to start from:
[house@leonov] sed -i fileA -e 's/.*$/& \t fileA/g'
[house@leonov] sed -i fileB -e 's/.*$/& \t fileB/g'
[house@leonov] cat fileA fileB >> common
[house@leonov] sort common
chr1 110751878 NP_006393.2 HBXIP P45L fileB
chr1 116944164 NP_001533.2 IGSF3 R671W fileB
chr1 246803952 NP_001001821.1 OR2T34 A244T fileB
chr1 31237964 NP_001018494.1 PUM1 M340L fileA
chr1 31237964 NP_055491.1 PUM1 M340L fileA
chr1 33251518 NP_001616.1 AK2 H191D fileA
chr1 33251518 NP_001616.1 AK2 H191D fileB
chr1 33251518 NP_037543.1 AK2 H191D fileA
chr1 57027345 NP_001004303.2 C1orf168 P270S fileA
chr1 57027345 NP_001004303.2 C1orf168 P270S fileB
chr1 89606840 NP_940862.2 GBP6 R48C fileB
Given your sample data, the following script:
# Report records shared by two or more input files, grouped by how many
# files contain them.  Output format, per group:
#   records found in <k> files:
#       <file1/file2/...>  -->  <record text>
# FIX: the original counted a record once per OCCURRENCE, so a record
# repeated inside ONE file produced output like "fileA/fileA/fileA" and
# inflated its apparent file count.  The seen[] guard below counts each
# (file, record) pair only once.
awk 'END {
    # rec[record] is a "/"-joined list of the files containing record
    for (R in rec) {
        n = split(rec[R], t, "/")
        if (n > 1)
            dup[n] = dup[n] ? dup[n] RS sprintf("\t%-20s -->\t%s", rec[R], R) : \
                sprintf("\t%-20s -->\t%s", rec[R], R)
    }
    # one report section per sharing count (2 files, 3 files, ...)
    for (D in dup) {
        printf "records found in %d files:\n\n", D
        printf "%s\n\n", dup[D]
    }
}
# append FILENAME only the FIRST time this record is seen in this file,
# so duplicate lines within a single file cannot fake a cross-file match
!seen[FILENAME, $0]++ {
    rec[$0] = rec[$0] ? rec[$0] "/" FILENAME : FILENAME
}' file[a-d]
Outputs:
records found in 2 files:
filea/fileb --> chr1 57027345 NP_001004303.2 C1orf168 P270S
fileb/filec --> chr1 89606840 NP_940862.2 GBP6 R48C
filec/filed --> chr1 36323375 NP_055281.2 TEKT2 R61G
records found in 3 files:
filea/fileb/filec --> chr1 33251518 NP_001616.1 AK2 H191D
1 Like
This assumes there are no duplicate lines within the same file.
2 means from 2 files, 3 means from 3 files.
$ sort File* |uniq -c |sort -n
1 chr1 110751878 NP_006393.2 HBXIP P45L
1 chr1 116944164 NP_001533.2 IGSF3 R671W
1 chr1 116944223 NP_001007238.1 IGSF3 S631I
1 chr1 150394079 XP_001724459.1 RPTN E707G
1 chr1 150547095 NP_002007.1 FLG E2297D
1 chr1 17164810 NP_055490.3 CROCC G1471R
1 chr1 172075300 NP_060592.2 DARS2 G338E
1 chr1 222620225 NP_054903.1 CNIH4 G54S
1 chr1 246803952 NP_001001821.1 OR2T34 A244T
1 chr1 31237964 NP_001018494.1 PUM1 M340L
1 chr1 31237964 NP_055491.1 PUM1 M340L
1 chr1 33251518 NP_037543.1 AK2 H191D
1 chr1 40302534 NP_006358.1 CAP1 V115L
1 chr1 62026171 NP_795352.2 INADL P336H
1 chr1 116944223 NP_001533.2 IGSF3 S651I
2 chr1 36323375 NP_055281.2 TEKT2 R61G
2 chr1 57027345 NP_001004303.2 C1orf168 P270S
2 chr1 89606840 NP_940862.2 GBP6 R48C
3 chr1 33251518 NP_001616.1 AK2 H191D
Just to clarify that I wrote all that code only because of this requirement:
1 Like
Dear Radoulov,
That worked perfectly well.. exactly as I wanted!
I would like to know if this script is extensible for say a hundred such files?
Also, if the files are named differently — not file[a-d] — how will the code change?
Could you also give a brief explanation if time permits.
Sincere thanks
~GH
If you want to generalize this works for finding common lines that occur in any n files:
# Print every line that occurs in ALL files under /directory/to/files.
# Fixes relative to the original posting:
#   * filecnt was the raw `find` output (a newline-separated list of
#     paths), not a count — pipe through `wc -l` to get the number
#   * `arr==n` and `print arr` used the awk ARRAY name as a scalar,
#     which is an error in awk; the element arr[i] and its index i
#     (the line text itself) were intended
#   * `$( find ... )` word-splits, breaking on paths with whitespace;
#     `find -exec ... {} +` passes the paths safely
filecnt=$(find /directory/to/files -type f | wc -l)
find /directory/to/files -type f -exec awk -v n="$filecnt" '
  { arr[$0]++ }             # tally every line across all input files
  END {
    for (i in arr)          # i is the line text itself
      if (arr[i] == n)      # seen once per file => present in all files
        print i
  }' {} + > outputfil
# Change the == n test (e.g. to >= 2) for "common to at least n files".
# NOTE(review): the whole tally is held in memory, so hundreds of large
# files may exhaust it; also assumes no duplicate lines within one file.
change the value of n to be less than the number of files or whatever you need. Note that if your files are hundreds of MB each, you will probably run out of virtual memory if you try this on a large number of files.
Yes,
in this case the number of arguments (input files) is limited only by your system (MAX_ARGS kernel limit).
As far as the script is concerned, the filenames are irrelevant,
just pass the input files as arguments:
# Same script as above: remember, for each distinct input record, which
# files it came from, then report every record that appears in 2+ files.
# Pass the input files as ordinary arguments; their names are arbitrary.
# NOTE(review): a record repeated WITHIN one file is counted once per
# occurrence, so in-file duplicates inflate the file count (the
# "fileA/fileA/fileA" symptom reported later in this thread) — confirm
# the inputs are duplicate-free before relying on the counts.
awk 'END {
# after all input is read: for each record, count how many
# filenames were accumulated in its "/"-joined list
for (R in rec) {
n = split(rec[R], t, "/")
if (n > 1)
dup[n] = dup[n] ? dup[n] RS sprintf("\t%-20s -->\t%s", rec[R], R) : \
sprintf("\t%-20s -->\t%s", rec[R], R)
}
# emit one section per sharing count (2 files, 3 files, ...)
for (D in dup) {
printf "records found in %d files:\n\n", D
printf "%s\n\n", dup[D]
}
}
{
# main rule: map record text -> "/"-joined list of source filenames
rec[$0] = rec[$0] ? rec[$0] "/" FILENAME : FILENAME
}' <any_filename_1> <any_filename_2> ... <any_filename_n>
I'll post the explanation later.
2 Likes
This does not work correctly.
The file results show
fileA/fileA/fileA/fileA/fileA --> chr1
in my actual data.
The sample worked — and I know there are no duplicate lines, because at least one field in every line differs from the others.
Please help
~GH
... so you should post samples of your real data files.
I've been asked to provide an explanation of the awk code:
awk 'END {
# the END block is executed after
# all the input has been read
# loop over the rec array
# and build the dup array, indexed by the number of
# filenames containing a given record
for (R in rec) {
n = split(rec[R], t, "/")
if (n > 1)
dup[n] = dup[n] ? dup[n] RS sprintf("\t%-20s -->\t%s", rec[R], R) : \
sprintf("\t%-20s -->\t%s", rec[R], R)
}
# loop over the dup array
# and report, for each sharing count, the names of the files
# containing each shared record
for (D in dup) {
printf "records found in %d files:\n\n", D
printf "%s\n\n", dup[D]
}
}
{
# build an array named rec (short for record), indexed by
# the content of the current record ($0), concatenating
# the filenames separated by / as values
rec[$0] = rec[$0] ? rec[$0] "/" FILENAME : FILENAME
}' file[a-d]