Hello,
I want to loop thru a vector composed of many entries as structure, which contains sequenceID and sequence. At looping, delete any structure if the sequence is a perfect-match substring of another sequence of any other structure, so that the resulted vector contains only unique sequences.
I have difficulty for the parts to access the sequence as vector member by iterator, and to compare the sequences as structure member thru iterator.
#include <iostream>
#include <fstream>
#include <string.h> //Needed for strtok and strcpy
#include <string>
#include <cstring>
#include <vector>
#include <map>
#include <algorithm>
/****************************************************************************
* Try to remove all the redundant sequences in a multiple fasta file
* Redundant sequences are those including any substring of perfect-match
* from both the forward strand and the reverse complementary strand
*****************************************************************************/
using namespace std;
//Read the sequences into structure containing both strands and the length
typedef struct fasta {
string seqID;
string seq;
// string *seq_rc;
int seqLength;
} SEQ;
bool compareByLength(const SEQ & a, const SEQ & b)
{
if (a.seqLength == b.seqLength) {
return a.seq < b.seq;
}
else
return a.seqLength < b.seqLength;
// return (a.seq).size() < (b.seq).size(); //This is alternative way
}
int main(int argc, char *argv[])
{
ifstream inFILE;
inFILE.open(argv[1]);
SEQ entry;
vector < SEQ > seqSet;
string entryID;
map < string, string > FastaSeq;
if (inFILE.fail()) {
cout << "Error! input file failed to open!" << endl;
cout << "Usage: ./prog fasta.file" << endl;
return 1;
} else {
string line; // Get a single line to parse
char *sPtr;
while (inFILE.good()) {
getline(inFILE, line);
char *sArray = new char[line.length() + 1];
strcpy(sArray, line.c_str());
if (sArray[0] == '>') { /*For head row of each entry */
sPtr = strtok(sArray, " ");
entryID = sPtr + 1; //Skip the ">" symbol
entry.seqID = entryID; // copy the entryID to structure entry->seqID
continue; //Only need the first token for sequence ID
} //End of the header row
else { /* For sequence rows */
sPtr = strtok(sArray, " "); //Start to tokenize the line by space
while (sPtr != NULL) { /* starts a single row of sequence part */
FastaSeq[entryID] += sPtr;
sPtr = strtok(NULL, " ");
} //Ends a single row of sequence part
entry.seq = FastaSeq[entryID];
// cout << " ****** " << FastaSeq[entryID] << endl;
// seq_tmp = revcomp(sPtr);
// strcpy(entry->seq_rc, sPtr);
entry.seqLength = strlen(FastaSeq[entryID].c_str());
// cout << entry.seqLength << endl;
seqSet.push_back(entry); // push the new sequence info into vector
} //Ends all the sequence part of each entry
delete[]sArray; //Lambda? to release the memory
} //End the file
}
sort(seqSet.begin(), seqSet.end(), compareByLength);
vector < SEQ >::iterator itr1, itr2, itr3;
for (itr1 = seqSet.begin(); itr1 < seqSet.end(); itr1++) {
for (itr2 = itr1 + 1; itr2 < seqSet.end(); itr2++) {
if ((itr2->seq).find(itr1->seq)) { //Line 90
// cout << "Found substring!" << "\t\t"; //Line 90a
// cout << itr1->seqID << "\t"; //Line 90b
// cout << itr1->seq << "\t vs. \t"; //Line 90c
// cout << itr2->seqID << "\t"; //Line 90d
// cout << (*itr2).seq << endl; //Line 90e
seqSet.erase(itr1); //Line 91
}
}
}
//Printing the vector with iterator
for (itr3 = seqSet.begin(); itr3 != seqSet.end(); itr3++) {
cout << (*itr3).seqID << "\t"; //Or, itr3->seqID
cout << itr3->seq << "\t";
cout << itr3->seqLength << endl;
}
inFILE.close();
return 0;
}
Debugging for a while, finally code was compiled without error!
test.fasta
>seq1
ATCGATCGATATATATATATATAT
>seq2 sub of seq1
CGATCGATATATATATAT
>seq3 sub of seq1
ATCGATATATAT
>seq4 new
ATATATATCGATCG
>seq5 new
ATCGATCGATCGATCGTAGTCGCG
>seq6 new
ATCGATCGCGCGCGCGCGCGCGCGC
>seq7 sub of seq6
CGCGCGCGCGCGCG
$ g++ -Wall rm_redundantseq.cpp
$ ./a.out test.fasta
seq2 CGATCGATATATATATAT 18
seq5 ATCGATCGATCGATCGTAGTCGCG 24
seq6 ATCGATCGCGCGCGCGCGCGCGCGC 25
But I am expecting
seq1 ATCGATCGATATATATATATATAT 23
seq4 ATATATATCGATCG 14
seq5 ATCGATCGATCGATCGTAGTCGCG 24
seq6 ATCGATCGCGCGCGCGCGCGCGCGC 25
Likely Line 90 & 91 are of problem, but not quite sure.
Two questions I have here:
1) Line 90 Find substring from masterstring, as not all the entry are substring of the others. string.find() function seems return a pointer, but there is no error here. Why?
2) Line 91 remove the vector member (structure) thru iterator , is this the right way?
Google's for a while, could not get a clear answer.
Thanks a lot!