// lab2.cc // Sequence matching driver for CS 241 Lab 2, Spring 2003 // // SYNTAX: lab2 [ ] // // This sequence matching program reads two sequences, a CORPUS and a PATTERN, // from files on disk and finds all strings of a given length M (given // interactively by the user) that occur in both corpus and pattern. // Matching substrings are printed in the form // // where the two indices are the offsets of the match within the corpus and // pattern, respectively, and is the actual matching string. // // As an optional third argument, the program can take a file containing a // MASK sequence. Substrings of the mask sequence are considered // "uninteresting" and so must not be reported by the matching code. // To implement this requirement, we delete any occurrences of substrings // in the mask sequence from our pattern table before performing the search. // #include using namespace std; #include "SeqReader.h" #include "StringDictionary.h" // // local prototypes // static StringDictionary *createTable(const char *patternSeq, int patternLength, int matchLength); static void maskTable(StringDictionary *table, const char *maskSeq, int maskLength, int matchLength); static void findMatches(StringDictionary *table, const char *corpusSeq, int corpusLength, int matchLength); int main(int argc, char *argv[]){ const char *corpusSeq = NULL; const char *patternSeq = NULL; const char *maskSeq = NULL; int corpusLength, patternLength, maskLength; if (argc < 3) { cout << "Syntax: Lab2 []\n"; exit(1); } else { corpusSeq = readSeq(argv[1], &corpusLength); patternSeq = readSeq(argv[2], &patternLength); cout << "CORPUS: " << corpusLength << " bases" << endl;; cout << "PATTERN: " << patternLength << " bases" << endl; if (argc > 3){ maskSeq = readSeq(argv[3], &maskLength); cout << "MASK: " << maskLength << " bases" << endl; } } // Interactively get the desired match length // int matchLength; cout << "Match length? "; cin >> matchLength; StringDictionary *table = createTable(patternSeq, patternLength, matchLength); cout << "\nAfter creating the table, it holds "; cout << table->size() << " sequences of length " << matchLength << endl; if (maskSeq != NULL) maskTable(table, maskSeq, maskLength, matchLength); cout << "After removing the mask sequences, the table holds "; cout << table->size() << " sequences of length " << matchLength << endl << endl; findMatches(table, corpusSeq, corpusLength, matchLength); return 0; } // Create a new StringDictionary containing all substrings of the pattern // sequence. // static StringDictionary *createTable(const char *patternSeq, int patternLength, int matchLength) { StringDictionary *table = new StringDictionary(patternLength, matchLength); Record *rec; for (int j = 0; j < patternLength - matchLength + 1; j++){ const char *key = patternSeq + j; if (table->contains(key)){ rec = table->get(key); rec->positions.add(j); } else { rec = new Record(); rec->positions.add(j); table->put(key,rec); } } return table; } // Remove all substrings in the mask sequence from a StringDictionary. // static void maskTable(StringDictionary *table, const char *maskSeq, int maskLength, int matchLength){ for (int j = 0; j < maskLength - matchLength + 1; j++){ const char *key = maskSeq + j; Record* rec = table->remove(key); if (rec != NULL) delete rec; // don't need this struct anymore } } // Find and print all matches between the corpus sequence and any // string in a StringDictionary. // static void findMatches(StringDictionary *table, const char *corpusSeq, int corpusLength, int matchLength){ int numMatches = 0; for (int j = 0; j < corpusLength - matchLength + 1; j++){ const char *key = corpusSeq + j; if (table->contains(key)){ const Record *rec = table->get(key); for (unsigned int k = 0; k < rec->positions.length(); k++){ cout << j << ' ' << rec->positions[k] << ' '; for (int p = 0; p < matchLength; p++) cout << key[p]; numMatches++; cout << endl; } } } cout << "\nThere were " << numMatches << " matches found." << endl; }