libStatGen Software 1
Loading...
Searching...
No Matches
FastQFile.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <iostream>
19
20#include "InputFile.h"
21#include "FastQFile.h"
22#include "BaseUtilities.h"
23
24// Constructor.
25// minReadLength - The minimum length that a base sequence must be for
26// it to be valid.
27// numPrintableErrors - The maximum number of errors that should be reported
28// in detail before suppressing the errors.
29//
30FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
31 : myFile(NULL),
32 myBaseComposition(),
33 myQualPerCycle(),
34 myCountPerCycle(),
35 myCheckSeqID(true),
36 myInterleaved(false),
37 myPrevSeqID(""),
38 myMinReadLength(minReadLength),
39 myNumPrintableErrors(numPrintableErrors),
40 myMaxErrors(-1),
41 myDisableMessages(false),
42 myFileProblem(false)
43{
44 // Reset the member data.
45 reset();
46}
47
48
50{
51 myDisableMessages = true;
52}
53
54
56{
57 myDisableMessages = false;
58}
59
60
61// Disable Unique Sequence ID checking.
62// Unique Sequence ID checking is enabled by default.
64{
65 myCheckSeqID = false;
66}
67
68
69// Enable Unique Sequence ID checking.
70// Unique Sequence ID checking is enabled by default.
72{
73 myCheckSeqID = true;
74}
75
76
77/// Interleaved.
79{
80 myInterleaved = true;
81}
82
83
84// Set the number of errors after which to quit reading/validating a file.
85void FastQFile::setMaxErrors(int maxErrors)
86{
87 myMaxErrors = maxErrors;
88}
89
90
91// Open a FastQFile.
94{
95 // reset the member data.
96 reset();
97
98 myBaseComposition.resetBaseMapType();
99 myBaseComposition.setBaseMapType(spaceType);
100 myQualPerCycle.clear();
101 myCountPerCycle.clear();
102
104
105 // Close the file if there is already one open - checked by close.
106 status = closeFile();
107 if(status == FastQStatus::FASTQ_SUCCESS)
108 {
109 // Successfully closed a previously opened file if there was one.
110
111 // Open the file
112 myFile = ifopen(fileName, "rt");
113 myFileName = fileName;
114
115 if(myFile == NULL)
116 {
117 // Failed to open the file.
119 }
120 }
121
122 if(status != FastQStatus::FASTQ_SUCCESS)
123 {
124 // Failed to open the file.
125 std::string errorMessage = "ERROR: Failed to open file: ";
126 errorMessage += fileName;
127 logMessage(errorMessage.c_str());
128 }
129 return(status);
130}
131
132
133// Close a FastQFile.
135{
136 int closeStatus = 0; // Success.
137
138 // If a file has been opened, close it.
139 if(myFile != NULL)
140 {
141 // Close the file.
142 closeStatus = ifclose(myFile);
143 myFile = NULL;
144 }
145 if(closeStatus == 0)
146 {
147 // Success - either there wasn't a file to close or it was closed
148 // successfully.
150 }
151 else
152 {
153 std::string errorMessage = "Failed to close file: ";
154 errorMessage += myFileName.c_str();
155 logMessage(errorMessage.c_str());
157 }
158}
159
160
161// Check to see if the file is open.
163{
164 // Check to see if the file is open.
165 if((myFile != NULL) && (myFile->isOpen()))
166 {
167 // File pointer exists and the file is open.
168 return true;
169 }
170
171 // File is not open.
172 return false;
173}
174
175
176// Check to see if the file is at the end of the file.
178{
179 // Check to see if the file is open.
180 if((myFile != NULL) && (ifeof(myFile)))
181 {
182 // At EOF.
183 return true;
184 }
185
186 // Not at EOF.
187 return false;
188}
189
190
191// Returns whether or not to keep reading the file.
192// Stop reading (false) if eof or there is a problem reading the file.
194{
195 if(isEof() || myFileProblem)
196 {
197 return(false);
198 }
199 return(true);
200}
201
202
203// Validate the specified fastq file
205 bool printBaseComp,
206 BaseAsciiMap::SPACE_TYPE spaceType,
207 bool printQualAvg)
208{
209 // Open the fastqfile.
210 if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
211 {
212 // Failed to open the specified file.
214 }
215
216 // Track the total number of sequences that were validated.
217 int numSequences = 0;
218
219 // Keep reading the file until there are no more fastq sequences to process
220 // and not configured to quit after a certain number of errors or there
221 // has not yet been that many errors.
222 // Or exit if there is a problem reading the file.
224 while (keepReadingFile() &&
225 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
226 {
227 // Validate one sequence. This call will read all the lines for
228 // one sequence.
229 status = readFastQSequence();
230 if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
231 {
232 // Read a sequence and it is either valid or invalid, but
233 // either way, a sequence was read, so increment the sequence count.
234 ++numSequences;
235 }
236 else
237 {
238 // Other error, so break out of processing.
239 break;
240 }
241 }
242
243 // Report Base Composition Statistics.
244 if(printBaseComp)
245 {
246 myBaseComposition.print();
247 }
248
249 if(printQualAvg)
250 {
251 printAvgQual();
252 }
253
254 std::string finishMessage = "Finished processing ";
255 finishMessage += myFileName.c_str();
256 char buffer[100];
257 if(sprintf(buffer,
258 " with %u lines containing %d sequences.",
259 myLineNum, numSequences) > 0)
260 {
261 finishMessage += buffer;
262 logMessage(finishMessage.c_str());
263 }
264 if(sprintf(buffer,
265 "There were a total of %d errors.",
266 myNumErrors) > 0)
267 {
268 logMessage(buffer);
269 }
270
271 // Close the input file.
272 FastQStatus::Status closeStatus = closeFile();
273
274 if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID) &&
276 {
277 // Stopped validating due to some error other than invalid, so
278 // return that error.
279 return(status);
280 }
281 else if(myNumErrors == 0)
282 {
283 // No errors, check to see if there were any sequences.
284 // Finished processing all of the sequences in the file.
285 // If there are no sequences, report an error.
286 if(numSequences == 0)
287 {
288 // Empty file, return error.
289 logMessage("ERROR: No FastQSequences in the file.");
291 }
293 }
294 else
295 {
296 // The file is invalid. But check the close status. If the close
297 // failed, it means there is a problem with the file itself not just
298 // with validation, so the close failure should be returned.
299 if(closeStatus != FastQStatus::FASTQ_SUCCESS)
300 {
301 return(closeStatus);
302 }
304 }
305}
306
307
308// Reads and validates a single fastq sequence from myFile.
310{
311 // First verify that a file is open, if not, return failure.
312 if(!isOpen())
313 {
314 std::string message =
315 "ERROR: Trying to read a fastq file but no file is open.";
316 logMessage(message.c_str());
318 }
319
320 // Reset variables for each sequence.
321 resetForEachSequence();
322
323 bool valid = true;
324
325 // No sequence was read.
326 if(isTimeToQuit())
327 {
329 }
330
331 // The first line is the sequence identifier, so validate that.
332 valid = validateSequenceIdentifierLine();
333
334 if(myFileProblem)
335 {
337 }
338
339 // If we are at the end of the file, check to see if it is a partial
340 // sequence or just an empty line at the end.
341 if(ifeof(myFile))
342 {
343 // If the sequence identifier line was empty and we are at the
344 // end of the file, there is nothing more to validate.
345 if(mySequenceIdLine.Length() != 0)
346 {
347 // There was a sequence identifier line, so this is an incomplete
348 // sequence.
349 myErrorString = "Incomplete Sequence.\n";
350 reportErrorOnLine();
351
352 valid = false;
353 }
354 if(valid)
355 {
356 // Return failure - no sequences were left to read. At the end
357 // of the file. It wasn't invalid and it wasn't really an error.
359 }
360 else
361 {
363 }
364 }
365
366 // If enough errors, quit before reading any more.
367 if(isTimeToQuit())
368 {
369 // Means there was an error, so mark it as invalid.
371 }
372
373 // Validate the Raw Sequence Line(s) and the "+" line.
374 valid &= validateRawSequenceAndPlusLines();
375
376 if(myFileProblem)
377 {
379 }
380
381 // If enough errors, quit before reading any more.
382 if(isTimeToQuit())
383 {
385 }
386
387 // If it is the end of a file, it is missing the quality string.
388 if(ifeof(myFile))
389 {
390 // There was a sequence identifier line, so this is an incomplete
391 // sequence.
392 myErrorString = "Incomplete Sequence, missing Quality String.";
393 reportErrorOnLine();
394 valid = false;
396 }
397
398 // All that is left is to validate the quality string line(s).
399 valid &= validateQualityStringLines();
400
401 if(myFileProblem)
402 {
404 }
405
406 if(valid)
407 {
409 }
411}
412
413
414// Reads and validates the sequence identifier line of a fastq sequence.
415bool FastQFile::validateSequenceIdentifierLine()
416{
417 // Read the first line of the sequence.
418 int readStatus = mySequenceIdLine.ReadLine(myFile);
419
420 // Check to see if the read was successful.
421 if(readStatus <= 0)
422 {
423 // If EOF, not an error.
424 if(ifeof(myFile))
425 {
426 return true;
427 }
428 myFileProblem = true;
429 myErrorString = "Failure trying to read sequence identifier line";
430 reportErrorOnLine();
431 return false;
432 }
433
434 // If the line is 0 length and it is the end of the file, just
435 // return since this is the eof - no error.
436 if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
437 {
438 // Not an error, just a new line at the end of the file.
439 return true;
440 }
441
442 // Increment the line number.
443 myLineNum++;
444
445 // Verify that the line has at least 2 characters: '@' and at least
446 // one character for the sequence identifier.
447 if(mySequenceIdLine.Length() < 2)
448 {
449 // Error. Sequence Identifier line not long enough.
450 myErrorString = "The sequence identifier line was too short.";
451 reportErrorOnLine();
452 return false;
453 }
454
455 // The sequence identifier line must start wtih a '@'
456 if(mySequenceIdLine[0] != '@')
457 {
458 // Error - sequence identifier line does not begin with an '@'.
459 myErrorString = "First line of a sequence does not begin with @";
460 reportErrorOnLine();
461 return false;
462 }
463
464 // Valid Sequence Identifier Line.
465
466 // The sequence identifier ends at the first space or at the end of the
467 // line if there is no space.
468 // Use fast find since this is a case insensitive search.
469 // Start at 1 since we know that 0 is '@'
470 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
471
472 // Check if a " " was found.
473 if(endSequenceIdentifier == -1)
474 {
475 // Did not find a ' ', so the identifier is the rest of the line.
476 // It starts at 1 since @ is at offset 0.
477 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
478 }
479 else
480 {
481 // Found a ' ', so the identifier ends just before that.
482 // The sequence identifier must be at least 1 character long,
483 // therefore the endSequenceIdentifier must be greater than 1.
484 if(endSequenceIdentifier <= 1)
485 {
486 myErrorString =
487 "No Sequence Identifier specified before the comment.";
488 reportErrorOnLine();
489 return false;
490 }
491
492 mySequenceIdentifier =
493 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
494 }
495
496 // If myInterleaved, validate matches the previous seqID.
497 if(myInterleaved && (myPrevSeqID != ""))
498 {
499 // Valid if the sequence identifiers are identical or if
500 // the only difference is a trailing 1 or 2.
501 if(myPrevSeqID.compare(mySequenceIdentifier) != 0)
502 {
503 // Compare all but the last characters, then check the last characters for 1 or 2.
504 if((myPrevSeqID.compare(0, myPrevSeqID.length()-1, mySequenceIdentifier.c_str(), mySequenceIdentifier.Length()-1) != 0) ||
505 (((myPrevSeqID[myPrevSeqID.length()-1] != '1') || (mySequenceIdentifier[mySequenceIdentifier.Length()-1] != '2')) &&
506 (myPrevSeqID[myPrevSeqID.length()-1] != mySequenceIdentifier[mySequenceIdentifier.Length()-1])))
507 {
508 myErrorString = "Interleaved: consecutive reads do not have matching sequence identifiers: ";
509 myErrorString += mySequenceIdentifier.c_str();
510 myErrorString += " and ";
511 myErrorString += myPrevSeqID.c_str();
512 reportErrorOnLine();
513 myPrevSeqID.clear();
514 return(false);
515 }
516 }
517 myPrevSeqID.clear();
518 }
519 else
520 {
521 if(myInterleaved)
522 {
523 myPrevSeqID = mySequenceIdentifier.c_str();
524 }
525
526 // Check if sequence identifier should be validated for uniqueness if it is
527 // not the 2nd in an interleaved pair.
528 if(myCheckSeqID)
529 {
530 // Check to see if the sequenceIdentifier is a repeat by adding
531 // it to the set and seeing if it already existed.
532 std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
533 insertResult =
534 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
535 myLineNum));
536
537 if(insertResult.second == false)
538 {
539 // Sequence Identifier is a repeat.
540 myErrorString = "Repeated Sequence Identifier: ";
541 myErrorString += mySequenceIdentifier.c_str();
542 myErrorString += " at Lines ";
543 myErrorString += insertResult.first->second;
544 myErrorString += " and ";
545 myErrorString += myLineNum;
546 reportErrorOnLine();
547 return(false);
548 }
549 }
550 }
551
552 // Valid, return true.
553 return(true);
554}
555
556
557// Reads and validates the raw sequence line(s) and the plus line. Both are
558// included in one method since it is unknown when the raw sequence line
559// ends until you find the plus line that divides it from the quality
560// string. Since this method will read the plus line to know when the
561// raw sequence ends, it also validates that line.
562bool FastQFile::validateRawSequenceAndPlusLines()
563{
564 // Read the raw sequence.
565 int readStatus = myRawSequence.ReadLine(myFile);
566
567 myLineNum++;
568
569 if(readStatus <= 0)
570 {
571 myFileProblem = true;
572 myErrorString = "Failure trying to read sequence line";
573 reportErrorOnLine();
574 return false;
575 }
576
577 // Offset into the raw sequence to be validated.
578 int offset = 0;
579
580 // Validate the raw sequence.
581 bool valid = validateRawSequence(offset);
582
583 // Increment the offset for what was just read.
584 offset = myRawSequence.Length();
585
586 // The next line is either a continuation of the raw sequence or it starts
587 // with a '+'
588 // Keep validating til the '+' line or the end of file is found.
589 bool stillRawLine = true;
590 while(stillRawLine &&
591 !ifeof(myFile))
592 {
593 // If enough errors, quit before reading any more.
594 if(isTimeToQuit())
595 {
596 return(false);
597 }
598
599 // Read the next line.
600 // Read into the plus line, but if it isn't a plus line, then
601 // it will be copied into the raw sequence line.
602 readStatus = myPlusLine.ReadLine(myFile);
603 myLineNum++;
604
605 if(readStatus <= 0)
606 {
607 myFileProblem = true;
608 myErrorString = "Failure trying to read sequence/plus line";
609 reportErrorOnLine();
610 return false;
611 }
612
613 // Check if the next line is blank
614 if(myPlusLine.Length() == 0)
615 {
616 // The next line is blank. Assume it is part of the raw sequence and
617 // report an error since there are no valid characters on the line.
618 myErrorString =
619 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
620 reportErrorOnLine();
621 }
622 // Check for the plus line.
623 else if(myPlusLine[0] == '+')
624 {
625 // This is the + line.
626 valid &= validateSequencePlus();
627 stillRawLine = false;
628 }
629 else
630 {
631 // Not a plus line, so assume this is a continuation of the Raw
632 // Sequence.
633 // Copy from the plus line to the raw sequence line.
634 myRawSequence += myPlusLine;
635 myPlusLine.SetLength(0);
636 valid &= validateRawSequence(offset);
637
638 // Increment the offset.
639 offset = myRawSequence.Length();
640 }
641 }
642
643 // If enough errors, quit before reading any more.
644 if(isTimeToQuit())
645 {
646 return(false);
647 }
648
649 // Now that the entire raw sequence has been obtained, check its length
650 // against the minimum allowed length.
651 if(myRawSequence.Length() < myMinReadLength)
652 {
653 // Raw sequence is not long enough - error.
654 myErrorString = "Raw Sequence is shorter than the min read length: ";
655 myErrorString += myRawSequence.Length();
656 myErrorString += " < ";
657 myErrorString += myMinReadLength;
658 reportErrorOnLine();
659 valid = false;
660 }
661
662 // If enough errors, quit before reading any more.
663 if(isTimeToQuit())
664 {
665 return(false);
666 }
667
668 // if the flag still indicates it is processing the raw sequence that means
669 // we reached the end of the file without a '+' line. So report that error.
670 if(stillRawLine)
671 {
672 myErrorString = "Reached the end of the file without a '+' line.";
673 reportErrorOnLine();
674 valid = false;
675 }
676
677 return(valid);
678}
679
680
681// Reads and validates the quality string line(s).
682bool FastQFile::validateQualityStringLines()
683{
684 // Read the quality string.
685 int readStatus = myQualityString.ReadLine(myFile);
686 myLineNum++;
687
688 if(readStatus <= 0)
689 {
690 myFileProblem = true;
691 myErrorString = "Failure trying to read quality line";
692 reportErrorOnLine();
693 return false;
694 }
695
696 // track the offset into the quality string to validate.
697 int offset = 0;
698
699 // Validate this line of the quality string.
700 bool valid = validateQualityString(offset);
701
702 offset = myQualityString.Length();
703
704 // Keep reading quality string lines until the length of the
705 // raw sequence has been hit or the end of the file is reached.
706 while((myQualityString.Length() < myRawSequence.Length()) &&
707 (!ifeof(myFile)))
708 {
709 // If enough errors, quit before reading any more.
710 if(isTimeToQuit())
711 {
712 return(false);
713 }
714
715 // Read another line of the quality string.
716 readStatus = myTempPartialQuality.ReadLine(myFile);
717 myLineNum++;
718
719 if(readStatus <= 0)
720 {
721 myFileProblem = true;
722 myErrorString = "Failure trying to read quality line";
723 reportErrorOnLine();
724 return false;
725 }
726
727 myQualityString += myTempPartialQuality;
728 myTempPartialQuality.Clear();
729
730 // Validate this line of the quality string.
731 valid &= validateQualityString(offset);
732 offset = myQualityString.Length();
733 }
734
735 // If enough errors, quit before reading any more.
736 if(isTimeToQuit())
737 {
738 return(false);
739 }
740
741 // Validate that the quality string length is the same as the
742 // raw sequence length.
743 if(myQualityString.Length() != myRawSequence.Length())
744 {
745 myErrorString = "Quality string length (";
746 myErrorString += myQualityString.Length();
747 myErrorString += ") does not equal raw sequence length (";
748 myErrorString += myRawSequence.Length();
749 myErrorString += ")";
750 reportErrorOnLine();
751 valid = false;
752 }
753 return(valid);
754}
755
756
757// Method to validate a line that contains part of the raw sequence.
758bool FastQFile::validateRawSequence(int offset)
759{
760 bool validBase = false;
761 bool valid = true;
762
763 // Loop through validating each character is valid for the raw sequence.
764 for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
765 sequenceIndex++)
766 {
767 // Update the composition for this position. Returns false if the
768 // character was not a valid base.
769 validBase =
770 myBaseComposition.updateComposition(sequenceIndex,
771 myRawSequence[sequenceIndex]);
772 // Check the return
773 if(!validBase)
774 {
775 // Error, found a value that is not a valid base character.
776 myErrorString = "Invalid character ('";
777 myErrorString += myRawSequence[sequenceIndex];
778 myErrorString += "') in base sequence.";
779 reportErrorOnLine();
780 valid = false;
781 // If enough errors, quit before reading any more.
782 if(isTimeToQuit())
783 {
784 return(false);
785 }
786 }
787 }
788 return(valid);
789}
790
791
792// Method to validate the "+" line that seperates the raw sequence and the
793// quality string.
794bool FastQFile::validateSequencePlus()
795{
796 // Validate that optional sequence identifier is the same
797 // as the one on the @ line.
798
799 // Check to see if there is more to the line than just the plus
800 int lineLength = myPlusLine.Length();
801
802 // If the line is only 1 character or the second character is a space,
803 // then there is no sequence identifier on this line and there is nothing
804 // further to validate.
805 if((lineLength == 1) || (myPlusLine[1] == ' '))
806 {
807 // No sequence identifier, so just return valid.
808 return true;
809 }
810
811 // There is a sequence identifier on this line, so validate that
812 // it matches the one from the associated @ line.
813 // The read in line must be at least 1 more character ('+') than the
814 // sequence identifier read from the '@' line.
815 // If it is not longer than the sequence identifier, then we know that it
816 // cannot be the same.
817 int sequenceIdentifierLength = mySequenceIdentifier.Length();
818 if(lineLength <= sequenceIdentifierLength)
819 {
820 myErrorString =
821 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
822 reportErrorOnLine();
823 return false;
824 }
825
826 bool same = true;
827 int seqIndex = 0;
828 int lineIndex = 1; // Start at 1 since offset 0 has '+'
829
830 // Loop through the sequence index and the line buffer verifying they
831 // are the same until a difference is found or the end of the sequence
832 // identifier is found.
833 while((same == true) && (seqIndex < sequenceIdentifierLength))
834 {
835 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
836 {
837 myErrorString =
838 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
839 reportErrorOnLine();
840 same = false;
841 }
842 lineIndex++;
843 seqIndex++;
844 }
845 return(same);
846}
847
848
849// Method to validate the quality string.
850bool FastQFile::validateQualityString(int offset)
851{
852 bool valid = true;
853 if(myQualityString.Length() > (int)(myQualPerCycle.size()))
854 {
855 myQualPerCycle.resize(myQualityString.Length());
856 myCountPerCycle.resize(myQualityString.Length());
857 }
858 // For each character in the line, verify that it is ascii > 32.
859 for(int i=offset; i < myQualityString.Length(); i++)
860 {
861 if(myQualityString[i] <= 32)
862 {
863 myErrorString = "Invalid character ('";
864 myErrorString += myQualityString[i];
865 myErrorString += "') in quality string.";
866 reportErrorOnLine();
867 valid = false;
868 // If enough errors, quit before reading any more.
869 if(isTimeToQuit())
870 {
871 return(false);
872 }
873 }
874 else
875 {
876 myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]);
877 myCountPerCycle[i] += 1;
878 }
879 }
880 return(valid);
881}
882
883
884// Helper method for printing the contents of myErrorString. It will
885// only print the errors until the maximum number of reportable errors is
886// reached.
887void FastQFile::reportErrorOnLine()
888{
889 // Increment the total number of errors.
890 myNumErrors++;
891
892 // Only display the first X number of errors.
893 if(myNumErrors <= myNumPrintableErrors)
894 {
895 // Write the error with the line number.
896 char buffer[100];
897 sprintf(buffer, "ERROR on Line %u: ", myLineNum);
898 std::string message = buffer;
899 message += myErrorString.c_str();
900 logMessage(message.c_str());
901 }
902}
903
904
905// Reset member data that is unique for each fastQFile.
906void FastQFile::reset()
907{
908 // Each fastq file processing needs to also reset the member data for each
909 // sequence.
910 resetForEachSequence();
911 myNumErrors = 0; // per fastqfile
912 myLineNum = 0; // per fastqfile
913 myFileName.SetLength(0); // reset the filename string.
914 myIdentifierMap.clear(); // per fastqfile
915 myBaseComposition.clear(); // clear the base composition.
916 myQualPerCycle.clear();
917 myCountPerCycle.clear();
918 myFileProblem = false;
919}
920
921
922// Reset the member data that is unique for each sequence.
923void FastQFile::resetForEachSequence()
924{
925 myLineBuffer.SetLength(0);
926 myErrorString.SetLength(0);
927 myRawSequence.SetLength(0);
928 mySequenceIdLine.SetLength(0);
929 mySequenceIdentifier.SetLength(0);
930 myPlusLine.SetLength(0);
931 myQualityString.SetLength(0);
932 myTempPartialQuality.SetLength(0);
933}
934
935
936void FastQFile::logMessage(const char* logMessage)
937{
938 // Write the message if they are not disabled.
939 if(!myDisableMessages)
940 {
941 std::cout << logMessage << std::endl;
942 }
943}
944
945
946// Determine if it is time to quit by checking if we are to quit after a
947// certain number of errors and that many errors have been encountered.
948bool FastQFile::isTimeToQuit()
949{
950 // It is time to quit if we are to quit after a certain number of errors
951 // and that many errors have been encountered.
952 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
953 {
954 return(true);
955 }
956 return(false);
957}
958
959
960void FastQFile::printAvgQual()
961{
962 std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl;
963 std::cout.precision(2);
964 std::cout << std::fixed << "Read Index\tAverage Quality"
965 << std::endl;
966 if(myQualPerCycle.size() != myCountPerCycle.size())
967 {
968 // This is a code error and should NEVER happen.
969 std::cerr << "ERROR calculating the average Qualities per cycle\n";
970 }
971
972 double sumQual = 0;
973 double count = 0;
974 double avgQual = 0;
975 for(unsigned int i = 0; i < myQualPerCycle.size(); i++)
976 {
977 avgQual = 0;
978 if(myCountPerCycle[i] != 0)
979 {
980 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
981 }
982 std::cout << i << "\t" << avgQual << "\n";
983 sumQual += myQualPerCycle[i];
984 count += myCountPerCycle[i];
985 }
986 std::cout << std::endl;
987 avgQual = 0;
988 if(count != 0)
989 {
990 avgQual = sumQual / count;
991 }
992 std::cout << "Overall Average Phred Quality = " << avgQual << std::endl;
993}
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition InputFile.h:654
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition InputFile.h:580
SPACE_TYPE
The type of space (color or base) to use in the mapping.
bool updateComposition(unsigned int rawSequenceCharIndex, char baseChar)
Update the composition for the specified index with the specified character.
void clear()
Clear the composition stored in the base count vector.
void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType)
Set the base map type for this composition.
void print()
Print the composition.
void resetBaseMapType()
Reset the base map type for this composition.
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
void interleaved()
Interleaved.
Definition FastQFile.cpp:78
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition FastQFile.cpp:92
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition FastQFile.cpp:71
void disableMessages()
Disable messages - do not write to cout.
Definition FastQFile.cpp:49
bool isOpen()
Check to see if the file is open.
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition FastQFile.cpp:63
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
FastQStatus::Status closeFile()
Close a FastQFile.
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a proble...
void enableMessages()
Enable messages - write to cout.
Definition FastQFile.cpp:55
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition FastQFile.cpp:85
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
Definition FastQFile.cpp:30
bool isEof()
Check to see if the file is at the end of the file.
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition FastQStatus.h:31
@ FASTQ_ORDER_ERROR
means the methods are called out of order, like trying to read a file before opening it.
Definition FastQStatus.h:34
@ FASTQ_READ_ERROR
means that a problem occurred on a read.
Definition FastQStatus.h:37
@ FASTQ_SUCCESS
indicates method finished successfully.
Definition FastQStatus.h:32
@ FASTQ_INVALID
means that the sequence was invalid.
Definition FastQStatus.h:33
@ FASTQ_OPEN_ERROR
means the file could not be opened.
Definition FastQStatus.h:35
@ FASTQ_NO_SEQUENCE_ERROR
means there were no errors, but no sequences read.
Definition FastQStatus.h:38
@ FASTQ_CLOSE_ERROR
means the file could not be closed.
Definition FastQStatus.h:36
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition InputFile.h:423