19#include "SamFileHeader.h"
21#include "BamInterface.h"
22#include "SamInterface.h"
23#include "BgzfFileType.h"
36 : myStatus(errorHandlingType)
48 init(filename, mode, NULL);
56 : myStatus(errorHandlingType)
58 init(filename, mode, NULL);
67 init(filename, mode, header);
76 : myStatus(errorHandlingType)
78 init(filename, mode, header);
100 while (filename[lastchar] != 0) lastchar++;
103 if((lastchar >= 1) && (filename[0] ==
'-'))
107 if(strcmp(filename,
"-.bam") == 0)
123 ifread(myFilePtr, magic, 4);
125 else if(strcmp(filename,
"-.ubam") == 0)
135#ifdef __ZLIB_AVAILABLE__
136 BgzfFileType::setRequireEofBlock(
false);
144 ifread(myFilePtr, magic, 4);
146 else if((strcmp(filename,
"-") == 0) || (strcmp(filename,
"-.sam") == 0))
156 std::string errorMessage =
"Invalid SAM/BAM filename, ";
157 errorMessage += filename;
158 errorMessage +=
". From stdin, can only be '-', '-.sam', '-.bam', or '-.ubam'";
178 std::string errorMessage =
"Failed to Open ";
179 errorMessage += filename;
180 errorMessage +=
" for reading";
188 ifread(myFilePtr, magic, 4);
190 if (magic[0] ==
'B' && magic[1] ==
'A' && magic[2] ==
'M' &&
229 while (filename[lastchar] != 0) lastchar++;
231 filename[lastchar - 4] ==
'u' &&
232 filename[lastchar - 3] ==
'b' &&
233 filename[lastchar - 2] ==
'a' &&
234 filename[lastchar - 1] ==
'm')
238 if((lastchar == 6) && (filename[0] ==
'-') && (filename[1] ==
'.'))
247 else if (lastchar >= 3 &&
248 filename[lastchar - 3] ==
'b' &&
249 filename[lastchar - 2] ==
'a' &&
250 filename[lastchar - 1] ==
'm')
254 if((lastchar == 5) && (filename[0] ==
'-') && (filename[1] ==
'.'))
267 if((lastchar >= 1) && (filename[0] ==
'-'))
276 if (myFilePtr == NULL)
278 std::string errorMessage =
"Failed to Open ";
279 errorMessage += filename;
280 errorMessage +=
" for writing";
303 if(myBamIndex != NULL)
315 std::string errorMessage =
"Failed to read the bam Index file: ";
316 errorMessage += bamIndexFilename;
330 if(myFilePtr == NULL)
334 std::string errorMessage =
"Failed to read the bam Index file -"
335 " the BAM file needs to be read first in order to determine"
336 " the index filename.";
341 const char* bamBaseName = myFilePtr->
getFileName();
343 std::string indexName = bamBaseName;
346 bool foundFile =
true;
354 catch (std::exception&)
364 size_t startExt = indexName.find(
".bam");
365 if(startExt == std::string::npos)
372 indexName.erase(startExt, 4);
382 myRefPtr = reference;
389 myReadTranslation = translation;
396 myWriteTranslation = translation;
412 if (myFilePtr != NULL)
415 return(myFilePtr->
isOpen());
431 return(myInterfacePtr->isEOF(myFilePtr));
439 if (myFilePtr != NULL)
457 "Cannot read header since the file is not open for reading");
465 "Cannot read header since it has already been read.");
469 if(myInterfacePtr->readHeader(myFilePtr, header,
myStatus))
489 "Cannot write header since the file is not open for writing");
497 "Cannot write header since it has already been written");
501 if(myInterfacePtr->writeHeader(myFilePtr, header,
myStatus))
523 "Cannot read record since the file is not open for reading");
524 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
533 "Cannot read record since the header has not been read.");
534 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
542 if(!processNewSection(header))
563 if(!ensureIndexedReadPosition())
573 myInterfacePtr->readRecord(myFilePtr, header, record,
myStatus);
585 if(!checkRecordInSection(record))
593 uint16_t flag = record.
getFlag();
594 if((flag & myRequiredFlags) != myRequiredFlags)
600 if((flag & myExcludedFlags) != 0)
639 "Cannot write record since the file is not open for writing");
647 "Cannot write record since the header has not been written");
656 "Cannot write the record since the file is not properly sorted.");
667 myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
684 mySortedType = sortType;
723 "Cannot set section since there is no bam file open");
728 myOverlapSection = overlap;
733 myChunksToRead.clear();
736 myCurrentChunkEnd = 0;
742 myPrevReadName.Clear();
759 "Cannot set section since there is no bam file open");
764 myOverlapSection = overlap;
767 if((strcmp(refName,
"") == 0) || (strcmp(refName,
"*") == 0))
779 myChunksToRead.clear();
782 myCurrentChunkEnd = 0;
788 myPrevReadName.Clear();
796 myRequiredFlags = requiredFlags;
797 myExcludedFlags = excludedFlags;
806 if(myBamIndex == NULL)
809 "Cannot get num mapped reads from the index until it has been read.");
821 if(myBamIndex == NULL)
824 "Cannot get num unmapped reads from the index until it has been read.");
837 if(myBamIndex == NULL)
840 "Cannot get num mapped reads from the index until it has been read.");
844 if((strcmp(refName,
"") != 0) && (strcmp(refName,
"*") != 0))
859 if(myBamIndex == NULL)
862 "Cannot get num unmapped reads from the index until it has been read.");
866 if((strcmp(refName,
"") != 0) && (strcmp(refName,
"*") != 0))
926 myInterfacePtr = NULL;
932 myAttemptRecovery =
false;
938void SamFile::init(
const char* filename, OpenType mode,
SamFileHeader* header)
944 bool openStatus =
true;
959 std::cerr <<
"FAILURE - EXITING!!!" << std::endl;
969 if (myFilePtr != NULL)
975 if(myInterfacePtr != NULL)
977 delete myInterfacePtr;
978 myInterfacePtr = NULL;
985 myPrevReadName.Clear();
996 myNewSection =
false;
997 myOverlapSection =
true;
998 myCurrentChunkEnd = 0;
999 myChunksToRead.clear();
1000 if(myBamIndex != NULL)
1021 if(myRefPtr != NULL)
1027 bool status =
false;
1036 if(mySortedType ==
FLAG)
1039 mySortedType = getSortOrderFromHeader(header);
1049 if((myPrevReadName.Compare(readName) > 0) &&
1050 (strcmp(myPrevReadName.c_str(), readName) > 0))
1053 String errorMessage =
"ERROR: File is not sorted by read name at record ";
1055 errorMessage +=
"\n\tPrevious record was ";
1056 errorMessage += myPrevReadName;
1057 errorMessage +=
", but this record is ";
1058 errorMessage += readName;
1060 errorMessage.c_str());
1065 myPrevReadName = readName;
1081 myPrevRefID = refID;
1088 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1090 errorMessage +=
"\n\tPrevious record was unmapped, but this record is ";
1093 errorMessage.c_str());
1096 else if(refID < myPrevRefID)
1100 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1102 errorMessage +=
"\n\tPrevious record was ";
1104 errorMessage +=
", but this record is ";
1107 errorMessage.c_str());
1113 if(refID > myPrevRefID)
1123 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1125 errorMessage +=
"\n\tPreviousRecord was ";
1127 errorMessage +=
", but this record is ";
1130 errorMessage.c_str());
1135 myPrevRefID = refID;
1154 if(strcmp(tag,
"queryname") == 0)
1158 else if(strcmp(tag,
"coordinate") == 0)
1162 return(headerSortOrder);
1166 bool SamFile::ensureIndexedReadPosition()
1178 uint64_t currentPos =
iftell(myFilePtr);
1179 if(currentPos >= myCurrentChunkEnd)
1182 if(myChunksToRead.empty())
1189 Chunk newChunk = myChunksToRead.pop();
1193 if(newChunk.chunk_beg != currentPos)
1196 if(
ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) !=
true)
1200 "Failed to seek to the next record");
1205 myCurrentChunkEnd = newChunk.chunk_end;
1211bool SamFile::checkRecordInSection(
SamRecord& record)
1213 bool recordFound =
true;
1248 recordFound =
false;
1251 if(!myOverlapSection)
1260 ((myEndPos != -1) &&
1265 recordFound =
false;
1269 return(recordFound);
1275 myNewSection =
false;
1278 if(myBamIndex == NULL)
1282 "Cannot read section since there is no index file open");
1283 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
1292 "Cannot read section since there is no bam file open");
1293 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
1301 "Cannot read record since the header has not been read.");
1302 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
1312 myChunksToRead.clear();
1315 myCurrentChunkEnd = 0;
1319 if(!myRefName.empty())
1335 myChunksToRead) ==
true)
1341 String errorMsg =
"Failed to get the specified region, refID = ";
1342 errorMsg += myRefID;
1343 errorMsg +=
"; startPos = ";
1344 errorMsg += myStartPos;
1345 errorMsg +=
"; endPos = ";
1346 errorMsg += myEndPos;
1365bool SamFile::attemptRecoverySync(
bool (*checkSignature)(
void *data) ,
int length)
1367 if(myFilePtr==NULL)
return false;
1369 return myFilePtr->attemptRecoverySync(checkSignature, length);
1391 :
SamFile(filename, READ, errorHandlingType)
1399 :
SamFile(filename, READ, header)
1408 :
SamFile(filename, READ, errorHandlingType, header)
1413SamFileReader::~SamFileReader()
1435 :
SamFile(filename, WRITE, errorHandlingType)
1443 :
SamFile(filename, WRITE, header)
1452 :
SamFile(filename, WRITE, errorHandlingType, header)
1457SamFileWriter::~SamFileWriter()
bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
Get the list of chunks associated with this region.
int32_t getNumMappedReads(int32_t refID)
Get the number of mapped reads for this reference id.
SamStatus::Status readIndex(const char *filename)
static const int32_t REF_ID_ALL
The number used to indicate that all reference ids should be used.
int32_t getNumUnMappedReads(int32_t refID)
Get the number of unmapped reads for this reference id.
static const int32_t REF_ID_UNMAPPED
The number used for the reference id of unmapped reads.
HandlingType
This specifies how this class should respond to errors.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
SamFileReader()
Default Constructor.
SamFileWriter()
Default Constructor.
Allows the user to easily read/write a SAM/BAM file.
bool ReadHeader(SamFileHeader &header)
Reads the header section from the file and stores it in the passed in header.
bool IsOpen()
Returns whether or not the file has been opened successfully.
int32_t getNumUnMappedReadsFromIndex(int32_t refID)
Get the number of unmapped reads in the specified reference id.
void Close()
Close the file if there is one open.
void resetFile()
Resets the file prepping for a new file.
int32_t myPrevCoord
Previous values used for checking if the file is sorted.
bool myIsOpenForRead
Flag to indicate if a file is open for reading.
bool ReadBamIndex()
Read the bam index file using the BAM filename as a base.
bool SetReadSection(int32_t refID)
Sets which reference id (index into the BAM list of reference information) of the BAM file should be ...
SortedType
Enum for indicating the type of sort expected in the file.
@ UNSORTED
file is not sorted.
@ FLAG
SO flag from the header indicates the sort type.
@ QUERY_NAME
file is sorted by queryname.
@ COORDINATE
file is sorted by coordinate.
bool ReadRecord(SamFileHeader &header, SamRecord &record)
Reads the next record from the file & stores it in the passed in record.
bool IsStream()
Returns whether or not the file has been opened for streaming input/output.
void SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags)
Specify which reads should be returned by ReadRecord.
void SetReference(GenomeSequence *reference)
Sets the reference to the specified genome sequence object.
OpenType
Enum for indicating whether to open the file for read or write.
void GenerateStatistics(bool genStats)
Whether or not statistics should be generated for this file.
const BamIndex * GetBamIndex()
Return the bam index if one has been opened.
uint32_t GetNumOverlaps(SamRecord &samRecord)
Returns the number of bases in the passed in read that overlap the region that is currently set.
bool OpenForRead(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for reading with the specified filename, determing the type of file and SAM/BAM b...
bool IsEOF()
Returns whether or not the end of the file has been reached.
int32_t getNumMappedReadsFromIndex(int32_t refID)
Get the number of mapped reads in the specified reference id.
bool OpenForWrite(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for writing with the specified filename, determining SAM/BAM from the extension (...
bool WriteHeader(SamFileHeader &header)
Writes the specified header into the file.
bool myIsOpenForWrite
Flag to indicate if a file is open for writing.
bool myIsBamOpenForRead
Values for reading Sorted BAM files via the index.
bool myHasHeader
Flag to indicate if a header has been read/written - required before being able to read/write a recor...
virtual ~SamFile()
Destructor.
SamFile()
Default Constructor, initializes the variables, but does not open any files.
void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
Set the type of sequence translation to use when writing the sequence.
void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
Set the type of sequence translation to use when reading the sequence.
bool validateSortOrder(SamRecord &record, SamFileHeader &header)
Validate that the record is sorted compared to the previously read record if there is one,...
bool WriteRecord(SamFileHeader &header, SamRecord &record)
Writes the specified record into the file.
uint32_t myRecordCount
Keep a count of the number of records that have been read/written so far.
SamStatus myStatus
The status of the last SamFile command.
const char * GetStatusMessage()
Get the Status Message of the last call that sets status.
uint32_t GetCurrentRecordCount()
Return the number of records that have been read/written so far.
void setSortedValidation(SortedType sortType)
Set the flag to validate that the file is sorted as it is read/written.
SamStatistics * myStatistics
Pointer to the statistics for this file.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
@ NONE
Leave the sequence as is.
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
uint16_t getFlag()
Get the flag (FLAG).
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
static const int NO_REF_ID
Constant for the value returned if a reference id does not exist for a queried reference name.
Status
Return value enum for StatGenFile methods.
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
@ SUCCESS
method completed successfully.
@ FAIL_IO
method failed due to an I/O issue.
@ INVALID_SORT
record is invalid due to it not being sorted.
@ FAIL_PARSE
failed to parse a record/header - invalid format.
@ FAIL_ORDER
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.