libStatGen Software 1
Loading...
Searching...
No Matches
SamRecord.h
1/*
2 * Copyright (C) 2010-2011 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef __SAM_RECORD_H__
19#define __SAM_RECORD_H__
20
21#include <stdint.h>
22
23#include "GenomeSequence.h"
24#include "SamStatus.h"
25#include "LongHash.h"
26#include "MathVector.h"
27#include "StringArray.h"
28#include "IntArray.h"
29#include "SamFileHeader.h"
30#include "CigarRoller.h"
31
/// Structure of a BAM record.
/// Holds the fixed-length fields of a record followed by myData.
/// NOTE(review): myData is declared with a single element; presumably it
/// marks the start of the variable-length portion of the record (read name,
/// cigar, sequence, quality, tags) - confirm against the implementation file.
struct bamRecordStruct
{
public:
    int32_t myBlockSize;        ///< Block size of the record (see getBlockSize).
    int32_t myReferenceID;      ///< Reference sequence id (see getReferenceID).
    int32_t myPosition;         ///< Leftmost position of the record.
    /// Read name length, mapping quality and bin packed into one 32-bit unit.
    uint32_t myReadNameLength : 8, myMapQuality : 8, myBin : 16;
    /// Cigar length and flag packed into one 32-bit unit.
    uint32_t myCigarLength : 16, myFlag : 16;
    int32_t myReadLength;       ///< Length of the read.
    int32_t myMateReferenceID;  ///< Mate/next fragment's reference id.
    int32_t myMatePosition;     ///< Mate/next fragment's leftmost position.
    int32_t myInsertSize;       ///< Outer fragment length.
    char myData[1];
};
47
48
49/// Class providing an easy to use interface to get/set/operate on the
50/// fields in a SAM/BAM record.
52{
53public:
/// Enum containing the settings on how to translate the sequence if a
/// reference is available.  If no reference is available, no translation
/// is done.
enum SequenceTranslation {
    NONE,   ///< Leave the sequence as is.
    EQUAL,  ///< Translate bases that match the reference to '='
    BASES,  ///< Translate '=' to the actual base.
};
62
63 /// Default Constructor.
64 SamRecord();
65
66 /// Constructor that sets the error handling type.
67 /// \param errorHandlingType how to handle errors.
68 SamRecord(ErrorHandler::HandlingType errorHandlingType);
69
70 /// Destructor
71 ~SamRecord();
72
73 /// Reset the fields of the record to a default value.
74 /// This is not necessary when you are reading a SAM/BAM file,
75 /// but if you are setting fields, it is a good idea to clean
76 /// out a record before reusing it. Clearing it allows you to
77 /// not have to set any empty fields.
78 void resetRecord();
79
80 /// Returns whether or not the record is valid, setting the status to
81 /// indicate success or failure.
82 /// \param header SAM Header associated with the record. Used to perform
83 /// some validation against the header.
84 /// \return true if the record is valid, false if not.
85 bool isValid(SamFileHeader& header);
86
87 /// Set the reference to the specified genome sequence object.
88 /// \param reference pointer to the GenomeSequence object.
89 void setReference(GenomeSequence* reference);
90
91 /// Set the type of sequence translation to use when getting
92 /// the sequence. The default type (if this method is never called) is
93 /// NONE (the sequence is left as-is). Can be over-ridden by using
94 /// the accessors that take a SequenceTranslation parameter.
95 /// \param translation type of sequence translation to use.
97
98 ///////////////////////
99 /// @name Set Alignment Data
100 /// Set methods for record fields. All of the "set" methods set the
101 /// status to indicate success or the failure reason.
102 //@{
103
104 /// Set QNAME to the passed in name.
105 /// \param readName the readname to set the QNAME to.
106 /// \return true if successfully set, false if not.
107 bool setReadName(const char* readName);
108
109 /// Set the bitwise FLAG to the specified value.
110 /// \param flag integer flag to use.
111 /// \return true if successfully set, false if not.
112 bool setFlag(uint16_t flag);
113
114 /// Set the reference sequence name (RNAME) to the specified name, using
115 /// the header to determine the reference id.
116 /// \param header SAM/BAM header to use to determine the reference id.
117 /// \param referenceName reference name to use.
118 /// \return true if successfully set, false if not
119 bool setReferenceName(SamFileHeader& header,
120 const char* referenceName);
121
122 /// Set the leftmost position (POS) using the specified 1-based (SAM format)
123 /// value.
124 /// Internal processing handles the switching between SAM/BAM formats
125 /// when read/written.
126 /// \param position 1-based start position
127 /// \return true if successfully set, false if not.
128 bool set1BasedPosition(int32_t position);
129
130 /// Set the leftmost position using the specified 0-based (BAM format)
131 /// value.
132 /// Internal processing handles the switching between SAM/BAM formats
133 /// when read/written.
134 /// \param position 0-based start position
135 /// \return true if successfully set, false if not.
136 bool set0BasedPosition(int32_t position);
137
138 /// Set the mapping quality (MAPQ).
139 /// \param mapQuality map quality to set in the record.
140 /// \return true if successfully set, false if not.
141 bool setMapQuality(uint8_t mapQuality);
142
143 /// Set the CIGAR to the specified SAM formatted cigar string.
144 /// Internal processing handles the switching between SAM/BAM formats
145 /// when read/written.
146 /// \param cigar string containing the SAM formatted cigar.
147 /// \return true if successfully set, false if not.
148 bool setCigar(const char* cigar);
149
150 /// Set the CIGAR to the specified Cigar object.
151 /// Internal processing handles the switching between SAM/BAM formats
152 /// when read/written.
153 /// \param cigar object to set this record's cigar to have.
154 /// \return true if successfully set, false if not.
155 bool setCigar(const Cigar& cigar);
156
157
158 /// Set the mate/next fragment's reference sequence name (RNEXT) to the
159 /// specified name, using the header to determine the mate reference id.
160 /// \param header SAM/BAM header to use to determine the mate reference id.
161 /// \param mateReferenceName mate reference name to use.
162 /// \return true if successfully set, false if not
164 const char* mateReferenceName);
165
166 /// Set the mate/next fragment's leftmost position (PNEXT) using the
167 /// specified 1-based (SAM format) value.
168 /// Internal processing handles the switching between SAM/BAM formats
169 /// when read/written.
170 /// \param matePosition 1-based start position
171 /// \return true if successfully set, false if not.
172 bool set1BasedMatePosition(int32_t matePosition);
173
174 /// Set the mate/next fragment's leftmost position using the specified
175 /// 0-based (BAM format) value.
176 /// Internal processing handles the switching between SAM/BAM formats
177 /// when read/written.
178 /// \param matePosition 0-based start position
179 /// \return true if successfully set, false if not.
180 bool set0BasedMatePosition(int32_t matePosition);
181
182 /// Sets the inferred insert size (ISIZE)/observed template length (TLEN).
183 /// \param insertSize inferred insert size/observed template length.
184 /// \return true if successfully set, false if not.
185 bool setInsertSize(int32_t insertSize);
186
187 /// Sets the sequence (SEQ) to the specified SAM formatted sequence string.
188 /// Internal processing handles switching between SAM/BAM formats when
189 /// read/written.
190 /// \param seq SAM sequence string. May contain '='.
191 /// \return true if successfully set, false if not.
192 bool setSequence(const char* seq);
193
194 /// Sets the quality (QUAL) to the specified SAM formatted quality string.
195 /// Internal processing handles switching between SAM/BAM formats when
196 /// read/written.
197 /// \param quality SAM quality string.
198 /// \return true if successfully set, false if not.
199 bool setQuality(const char* quality);
200
201 /// Shift the indels (if any) to the left by updating the CIGAR.
202 /// \return true if the cigar was shifted, false if not.
203 bool shiftIndelsLeft();
204
205 /// Sets the SamRecord to contain the information in the BAM formatted
206 /// fromBuffer.
207 /// \param fromBuffer buffer to read the BAM record from.
208 /// \param fromBufferSize size of the buffer containing the BAM record.
209 /// \param header BAM header for the record.
210 /// \return status of reading the BAM record from the buffer.
211 SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize,
212 SamFileHeader& header);
213
214 /// Read the BAM record from a file.
215 /// \param filePtr file to read the buffer from.
216 /// \param header BAM header for the record.
217 /// \return status of the reading the BAM record from the file.
219
220 //@}
221
222 ///////////////////////
223 /// @name Set Tag Data
224 /// Set methods for tags.
225 //@{
226
227 /// Add the specified integer tag to the record. Internal processing
228 /// handles switching between SAM/BAM formats when read/written and
229 /// determining the type for BAM format. If the tag is already there
230 /// this code will replace it if the specified value is different.
231 /// \param tag two character tag to be added to the SAM/BAM record.
232 /// \param value value for the specified tag.
233 /// \return true if the tag was successfully added, false otherwise.
234 bool addIntTag(const char* tag, int32_t value);
235
236 /// Add the specified tag,vtype,value to the record. Vtype can be SAM/BAM
237 /// format. Internal processing handles switching between SAM/BAM formats
238 /// when read/written. If the tag is already there this code will replace
239 /// it if the specified value is different.
240 /// \param tag two character tag to be added to the SAM/BAM record.
241 /// \param vtype vtype of the specified value - either SAM/BAM vtypes.
242 /// \param value value as a string for the specified tag.
243 /// \return true if the tag was successfully added, false otherwise.
244 bool addTag(const char* tag, char vtype, const char* value);
245
246 /// Clear the tags in this record.
247 /// Does not set SamStatus.
248 void clearTags();
249
250 /// Remove a tag.
251 /// \param tag tag to remove.
252 /// \param type type of the tag to be removed.
253 /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record).
254 bool rmTag(const char* tag, char type);
255
256 /// Remove tags.
257 /// The delimiter between the tags is ',' or ';'. ',' was added since
258 /// the original delimiter, ';', requires the string to be quoted on the
259 /// command-line.
260 /// \param tags tags to remove, formatted as Tag:Type,Tag:Type,Tag:Type...
261 /// \return true if all tags no longer exist in the record, false if any could not be removed
262 /// (Returns true if the tags were not found in the record).
263 /// SamStatus is set to INVALID if the tags are incorrectly formatted.
264 bool rmTags(const char* tags);
265
266 //@}
267
268 ///////////////////////
269 /// @name Get Alignment Data
270 /// Get methods for record fields. All of the "get" methods set the
271 /// status to indicate success or the failure reason.
272 //@{
273
274 /// Get a const pointer to the buffer that contains the BAM representation
275 /// of the record.
276 /// \return const pointer to the buffer that contains the BAM representation
277 /// of the record.
278 const void* getRecordBuffer();
279
280 /// Get a const pointer to the buffer that contains the BAM representation
281 /// of the record using the specified translation on the sequence.
282 /// \param translation type of sequence translation to use.
283 /// \return const pointer to the buffer that contains the BAM representation
284 /// of the record.
285 const void* getRecordBuffer(SequenceTranslation translation);
286
287 /// Write the record as a BAM into the specified already opened file.
288 /// \param filePtr file to write the BAM record into.
289 /// \return status of the write.
291
292 /// Write the record as a BAM into the specified already opened file using
293 /// the specified translation on the sequence.
294 /// \param filePtr file to write the BAM record into.
295 /// \param translation type of sequence translation to use.
296 /// \return status of the write.
298 SequenceTranslation translation);
299
300 /// Get the block size of the record (BAM format).
301 /// \return BAM block size of the record.
302 int32_t getBlockSize();
303
304 /// Get the reference sequence name (RNAME) of the record.
305 /// \return reference sequence name
306 const char* getReferenceName();
307
308 /// Get the reference sequence id of the record (BAM format rid).
309 /// \return reference sequence id
310 int32_t getReferenceID();
311
312 /// Get the 1-based(SAM) leftmost position (POS) of the record.
313 /// \return 1-based leftmost position.
314 int32_t get1BasedPosition();
315
316 /// Get the 0-based(BAM) leftmost position of the record.
317 /// \return 0-based leftmost position.
318 int32_t get0BasedPosition();
319
320 /// Get the length of the readname (QNAME) including the null.
321 /// \return length of the read name (including null).
322 uint8_t getReadNameLength();
323
324 /// Get the mapping quality (MAPQ) of the record.
325 /// \return map quality.
326 uint8_t getMapQuality();
327
328 /// Get the BAM bin for the record.
329 /// \return BAM bin
330 uint16_t getBin();
331
332 /// Get the length of the BAM formatted CIGAR.
333 /// \return length of BAM formatted cigar.
334 uint16_t getCigarLength();
335
336 /// Get the flag (FLAG).
337 /// \return flag.
338 uint16_t getFlag();
339
340 /// Get the length of the read.
341 /// \return read length.
342 int32_t getReadLength();
343
344 /// Get the mate/next fragment's reference sequence name (RNEXT). If it
345 /// is equal to the reference name, it still returns the reference name.
346 /// \return reference sequence name
347 const char* getMateReferenceName();
348
349 /// Get the mate/next fragment's reference sequence name (RNEXT),
350 /// returning "=" if it is the same as the reference name, unless
351 /// they are both "*" in which case "*" is returned.
352 /// \return reference sequence name or '='
353 const char* getMateReferenceNameOrEqual();
354
355 /// Get the mate reference id of the record
356 /// (BAM format: mate_rid/next_refID).
357 /// \return reference id
358 int32_t getMateReferenceID();
359
360 /// Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
361 /// \return 1-based leftmost position.
362 int32_t get1BasedMatePosition();
363
364 /// Get the 0-based(BAM) leftmost mate/next fragment's position.
365 /// \return 0-based leftmost position.
366 int32_t get0BasedMatePosition();
367
368 /// Get the inferred insert size of the read pair (ISIZE) or
369 /// observed template length (TLEN).
370 /// \return inferred insert size or observed template length.
371 int32_t getInsertSize();
372
373 /// Returns the 0-based inclusive rightmost position of the
374 /// clipped sequence.
375 /// \return 0-based inclusive rightmost position
376 int32_t get0BasedAlignmentEnd();
377
378 /// Returns the 1-based inclusive rightmost position of the
379 /// clipped sequence.
380 /// \return 1-based inclusive rightmost position
381 int32_t get1BasedAlignmentEnd();
382
383 /// Returns the length of the clipped sequence, returning 0 if the cigar
384 /// is '*'.
385 /// \return length of the clipped sequence.
386 int32_t getAlignmentLength();
387
388 /// Returns the 0-based inclusive left-most position adjusted for
389 /// clipped bases.
390 /// \return 0-based inclusive leftmost position including clips.
391 int32_t get0BasedUnclippedStart();
392
393 /// Returns the 1-based inclusive left-most position adjusted for
394 /// clipped bases.
395 /// \return 1-based inclusive leftmost position including clips.
396 int32_t get1BasedUnclippedStart();
397
398 /// Returns the 0-based inclusive right-most position adjusted for
399 /// clipped bases.
400 /// \return 0-based inclusive rightmost position including clips.
401 int32_t get0BasedUnclippedEnd();
402
403 /// Returns the 1-based inclusive right-most position adjusted for
404 /// clipped bases.
405 /// \return 1-based inclusive rightmost position including clips.
406 int32_t get1BasedUnclippedEnd();
407
408 /// Returns the SAM formatted Read Name (QNAME).
409 /// \return read name.
410 const char* getReadName();
411
412 /// Returns the SAM formatted CIGAR string.
413 /// \return cigar string.
414 const char* getCigar();
415
416 /// Returns the SAM formatted sequence string (SEQ), translating the base as
417 /// specified by setSequenceTranslation.
418 /// \return sequence string.
419 const char* getSequence();
420
421 /// Returns the SAM formatted sequence string (SEQ) performing the specified
422 /// sequence translation.
423 /// \param translation type of sequence translation to use.
424 /// \return sequence string.
425 const char* getSequence(SequenceTranslation translation);
426
427 /// Returns the SAM formatted quality string (QUAL).
428 /// \return quality string.
429 const char* getQuality();
430
431 /// Get the sequence base at the specified index into this sequence 0 to
432 /// readLength - 1, translating the base as specified by
433 /// setSequenceTranslation. Throws an exception if index is out of range.
434 /// \param index index into the sequence string (0 to readLength-1).
435 /// \return the sequence base at the specified index into the sequence.
436 char getSequence(int index);
437
438 /// Get the sequence base at the specified index into this sequence 0 to
439 /// readLength - 1 performing the specified sequence translation.
440 /// Throws an exception if index is out of range.
441 /// \param index index into the sequence string (0 to readLength-1).
442 /// \param translation type of sequence translation to use.
443 /// \return the sequence base at the specified index into the sequence.
444 char getSequence(int index, SequenceTranslation translation);
445
446 /// Get the quality character at the specified index into the quality 0 to
447 /// readLength - 1. Throws an exception if index is out of range.
448 /// \param index index into the quality string (0 to readLength-1).
449 /// \return the quality character at the specified index into the quality.
450 char getQuality(int index);
451
452 /// Returns a pointer to the Cigar object associated with this record.
453 /// The object is essentially read-only, only allowing modifications
454 /// due to lazy evaluations.
455 /// \return pointer to the Cigar object.
457
458 /// Return the number of bases in this read that overlap the passed in
459 /// region. Matches & mismatches between the read and the reference
460 /// are counted as overlaps, but insertions, deletions, skips, clips, and
461 /// pads are not counted.
462 /// \param start inclusive 0-based start position (reference position) of
463 /// the region to check for overlaps in.
464 /// (-1 indicates to start at the beginning of the reference.)
465 /// \param end exclusive 0-based end position (reference position) of the
466 /// region to check for overlaps in.
467 /// (-1 indicates to go to the end of the reference.)
468 /// \return number of overlapping bases
469 uint32_t getNumOverlaps(int32_t start, int32_t end);
470
471 /// Returns the values of all fields except the tags.
472 /// \param recStruct structure containing the contents of all
473 /// non-variable length fields.
474 /// \param readName read name from the record (return param)
475 /// \param cigar cigar string from the record (return param)
476 /// \param sequence sequence string from the record (return param)
477 /// \param quality quality string from the record (return param)
478 /// \return true if all fields were successfully set, false otherwise.
479 bool getFields(bamRecordStruct& recStruct, String& readName,
480 String& cigar, String& sequence, String& quality);
481
482 /// Returns the values of all fields except the tags using the specified
483 /// sequence translation.
484 /// \param recStruct structure containing the contents of all
485 /// non-variable length fields.
486 /// \param readName read name from the record (return param)
487 /// \param cigar cigar string from the record (return param)
488 /// \param sequence sequence string from the record (return param)
489 /// \param quality quality string from the record (return param)
490 /// \param translation type of sequence translation to use.
491 /// \return true if all fields were successfully set, false otherwise.
492 bool getFields(bamRecordStruct& recStruct, String& readName,
493 String& cigar, String& sequence, String& quality,
494 SequenceTranslation translation);
495
496 /// Returns a pointer to the genome sequence object associated with this
497 /// record if it was set (NULL if it was not set).
498 /// \return pointer to the GenomeSequence object or NULL if there isn't one.
500
501 //@}
502
503 ///////////////////////
504 /// @name Get Tag Methods
505 /// Get methods for obtaining information on tags.
506 //@{
507
508 /// Returns the length of the BAM formatted tags.
509 /// \return length of the BAM formatted tags.
510 uint32_t getTagLength();
511
512 /// Get the next tag from the record.
513 /// Sets the Status to SUCCESS when a tag is successfully returned or
514 /// when there are no more tags. Otherwise the status is set to describe
515 /// why it failed (parsing, etc).
516 /// \param tag set to the tag when a tag is read.
517 /// \param vtype set to the vtype when a tag is read.
518 /// \param value pointer to the value of the tag (will need to cast
519 /// to int, float, char, or string based on vtype).
520 /// \return true if a tag was read, false if there are no more tags.
521 bool getNextSamTag(char* tag, char& vtype, void** value);
522
523 /// Reset the tag iterator to the beginning of the tags.
524 void resetTagIter();
525
526 /// Returns whether or not the specified vtype is an integer type.
527 /// Does not set SamStatus.
528 /// \param vtype value type to check.
529 /// \return true if the passed in vtype is an integer ('c', 'C', 's',
530 /// 'S', 'i', 'I'), false otherwise.
531 static bool isIntegerType(char vtype);
532
533 /// Returns whether or not the specified vtype is a float type.
534 /// Does not set SamStatus.
535 /// \param vtype value type to check.
536 /// \return true if the passed in vtype is a float ('f'), false otherwise.
537 static bool isFloatType(char vtype);
538
539 /// Returns whether or not the specified vtype is a char type.
540 /// Does not set SamStatus.
541 /// \param vtype value type to check.
542 /// \return true if the passed in vtype is a char ('A'), false otherwise.
543 static bool isCharType(char vtype);
544
545 /// Returns whether or not the specified vtype is a string type.
546 /// Does not set SamStatus.
547 /// \param vtype value type to check.
548 /// \return true if the passed in vtype is a string ('Z'/'B'), false otherwise.
549 static bool isStringType(char vtype);
550
551 /// Get the string representation of the tags from the record, formatted
552 /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
553 /// Sets the Status to SUCCESS when the tags are successfully returned or
554 /// the tags were not found. If a different error occurred, the status is
555 /// set appropriately.
556 /// The delimiter between the tags to retrieve is ',' or ';'. ',' was added
557 /// since the original delimiter, ';', requires the string to be quoted on
558 /// the command-line.
559 /// \param tags the tags to retrieve, formatted as TAG:TYPE,TAG:TYPE...
560 /// \param returnString the String to set (this method first clears returnString)
561 /// to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
562 /// \param delim delimiter to use to separate two tags, default is a tab.
563 /// \return true if there were not any errors even if no tags were found.
564 bool getTagsString(const char* tags, String& returnString, char delim = '\t');
565
566 /// Get the string value for the specified tag.
567 /// \param tag tag to retrieve
568 /// \return pointer to the tag's string value if found, NULL if not found.
569 const String* getStringTag(const char * tag);
570
571 /// Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure).
572 /// \param tag tag to retrieve
573 /// \return pointer to the tag's integer value if found, NULL if not found.
574 int* getIntegerTag(const char * tag);
575
576 /// Get the integer value for the specified tag.
577 /// \param tag tag to retrieve
578 /// \param tagVal return parameter with integer value for the tag
579 /// \return bool true if Integer tag was found and tagVal was set,
580 /// false if not.
581 bool getIntegerTag(const char * tag, int& tagVal);
582
583 /// Get the float value for the specified tag.
584 /// \param tag tag to retrieve
585 /// \param tagVal return parameter with float value for the tag
586 /// \return bool true if Float tag was found and tagVal was set,
587 /// false if not.
588 bool getFloatTag(const char * tag, float& tagVal);
589
590 /// Get the string value for the specified tag.
591 const String & getString(const char * tag);
592
593 /// Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool.
594 int & getInteger(const char * tag);
595
596 /// Check if the specified tag contains a string.
597 /// Does not set SamStatus.
598 /// \param tag SAM tag to check contents of.
599 /// \return true if the value associated with the tag is a string.
600 bool checkString(const char * tag)
601 { return(checkTag(tag, 'Z') || checkTag(tag, 'B')); }
602
603 /// Check if the specified tag contains an integer.
604 /// Does not set SamStatus.
605 /// \param tag SAM tag to check contents of.
606 /// \return true if the value associated with the tag is a string.
607 bool checkInteger(const char * tag) { return checkTag(tag, 'i'); }
608
609 /// Check if the specified tag contains a string.
610 /// Does not set SamStatus.
611 /// \param tag SAM tag to check contents of.
612 /// \return true if the value associated with the tag is a string.
613 bool checkFloat(const char * tag) { return checkTag(tag, 'f'); }
614
615 /// Check if the specified tag contains a value of the specified vtype.
616 /// Does not set SamStatus.
617 /// \param tag SAM tag to check contents of.
618 /// \param type value type to check if the SAM tag matches.
619 /// \return true if the value associated with the tag is of the specified type.
620 bool checkTag(const char * tag, char type);
621 //@}
622
623 /// Returns the status associated with the last method that sets the status.
624 /// \return SamStatus of the last command that sets status.
625 const SamStatus& getStatus();
626
627
628private:
// Map a tag value type onto the type used when building a key:
// every char/integer type ('A', 'c', 'C', 's', 'S', 'i', 'I') collapses
// to 'i'; any other type is returned unchanged.
static char getKeyType(char type)
{
    const bool isCharOrIntegerType =
        (type == 'A') || (type == 'c') || (type == 'C') ||
        (type == 's') || (type == 'S') ||
        (type == 'i') || (type == 'I');
    return isCharOrIntegerType ? 'i' : type;
}

// Build the integer key for a tag from its two characters and its type:
// the key type (see getKeyType) is shifted left by 16, the second tag
// character is shifted left by 8, and the first tag character is added as is.
static int MAKEKEY(char ch1, char ch2, char type)
{
    const int keyTypePart = getKeyType(type) << 16;
    const int secondCharPart = ch2 << 8;
    return keyTypePart + secondCharPart + ch1;
}
651
// Return the size associated with a numeric tag value type:
// 1 for 'A'/'c'/'C', 2 for 's'/'S', 4 for 'i'/'I'/'f',
// and 0 when the type is not a numeric type.
// NOTE(review): the sizes look like byte widths of the BAM encodings - confirm.
static inline int getNumericTagTypeSize(char type)
{
    if((type == 'A') || (type == 'c') || (type == 'C'))
    {
        return 1;
    }
    if((type == 's') || (type == 'S'))
    {
        return 2;
    }
    if((type == 'i') || (type == 'I') || (type == 'f'))
    {
        return 4;
    }
    // Not a numeric type.
    return 0;
}
674
675 // Allocate space for the record - does a realloc.
676 // The passed in size is the size of the entire record including the
677 // block size field.
678 // Adds any errors to myStatus.
679 bool allocateRecordStructure(int size);
680
681 void* getStringPtr(int offset);
682 void* getIntegerPtr(int offset, char& vtype);
683 void* getFloatPtr(int offset);
684
685 // Fixes the buffer to match the variable length fields.
686 // Adds any errors to myStatus.
687 bool fixBuffer(SequenceTranslation translation);
688
689 // Sets the Sequence and Quality strings from the buffer.
690 // They are done together in one method because they require the same
691 // loop, so might as well be done at the same time.
692 // Adds any errors to myStatus.
693 void setSequenceAndQualityFromBuffer();
694
695 // Parse the cigar to calculate the alignment/unclipped ends and convert
696 // to SAM/BAM format.
697 // Adds any errors to myStatus.
698 bool parseCigar();
699 // Parse the cigar string to calculate the cigar length and alignment end
700 // and convert to SAM format.
701 // Adds any errors to myStatus.
702 bool parseCigarBinary();
703 // Parse the cigar string to calculate the cigar length and alignment end
704 // and convert to BAM format.
705 // Adds any errors to myStatus.
706 bool parseCigarString();
707
708 // Set the tags from the buffer.
709 // Adds any errors to myStatus.
710 bool setTagsFromBuffer();
711
712 // Set the tags in the buffer.
713 // Adds any errors to myStatus.
714 bool setTagsInBuffer();
715
716 void setVariablesForNewBuffer(SamFileHeader& header);
717
718 void getTypeFromKey(int key, char& type) const;
719 void getTag(int key, char* tag) const;
720
721 String & getString(int offset);
722 int & getInteger(int offset);
723 const char & getIntegerType(int offset) const;
724 float & getFloat(int offset);
725
726 // Append the string representation of the value at the specified index
727 // of the int array.
728 inline void appendIntArrayValue(int index, String& strVal) const
729 {
730 appendIntArrayValue(intType[index], integers[index], strVal);
731 }
732
733 void appendIntArrayValue(char type, int value, String& strVal) const;
734
735 int getBtagBufferSize(String& tagStr);
736 int setBtagBuffer(String& tagStr, char* extraPtr);
737 int getStringFromBtagBuffer(unsigned char* buffer, String& tagStr);
738
739 static const int DEFAULT_BLOCK_SIZE = 40;
740 static const int DEFAULT_BIN = 4680;
741 static const int DEFAULT_READ_NAME_LENGTH = 8;
742 static const char* DEFAULT_READ_NAME;
743 static const char* FIELD_ABSENT_STRING;
744
745 bamRecordStruct * myRecordPtr;
746 int allocatedSize;
747
748 // Pointer to a temporary cigar buffer that can be used during string
749 // parsing before it is ready to be copied into the actual record.
750 uint32_t* myCigarTempBuffer;
751
752 // Size of the currently allocated temporary cigar buffer.
753 int myCigarTempBufferAllocatedSize;
754
755 // Length of the cigar currently contained in the temporary buffer.
756 int myCigarTempBufferLength;
757
758 // Track if the buffer is in sync with the Strings/Tags.
759 // Set to false if any of the variable length fields are modified.
760 // Set to true when the buffer is updated to match the variable length
761 // fields.
762 bool myIsBufferSynced;
763
764 // Track if the tags need to be set from the buffer.
765 bool myNeedToSetTagsFromBuffer;
766
767 // Track if the tags need to be set in the buffer.
768 // Allows you to set just the tags if they are the only thing that changed
769 // in the buffer.
770 bool myNeedToSetTagsInBuffer;
771
772 int myTagBufferSize;
773 int myLastTagIndex;
774
775 String myReadName;
776 String myReferenceName;
777 String myMateReferenceName;
778 String myCigar;
779 String mySequence;
780 String myQuality;
781
782 std::string mySeqWithEq;
783 std::string mySeqWithoutEq;
784
785 // The length of the alignment.
786 int32_t myAlignmentLength;
787 // Unclipped alignment positions.
788 int32_t myUnclippedStartOffset;
789 int32_t myUnclippedEndOffset;
790
791 CigarRoller myCigarRoller;
792
793 LongHash<int> extras;
794 // Note: not all values in strings, integers, and floats are always
795 // in extras. They will not be if the tags were removed. Removed
796 // tags are removed from extras, but not from strings, integers, or floats
797 // since if one was removed from these arrays, all other entries would
798 // need their indices updated in extras.
799 StringArray strings;
800 IntArray integers;
801 std::vector<char> intType; // contains the type of int at same position in integers.
802 std::vector<float> floats;
803
804
805 // Track whether or not the buffer values are correct for
806 // each setting.
807 bool myIsReadNameBufferValid;
808 bool myIsCigarBufferValid;
809 bool myIsSequenceBufferValid;
810 bool myIsQualityBufferValid;
811 bool myIsTagsBufferValid;
812 bool myIsBinValid;
813
814 unsigned char* myPackedSequence;
815 unsigned char* myPackedQuality;
816
817
818 SamStatus myStatus;
819
820 // The current translation of the sequence as it occurs in the buffer.
821 // Only applicable if myIsSequenceBufferValid == true.
822 SequenceTranslation myBufferSequenceTranslation;
823
824
825 // Track the Reference.
826 GenomeSequence* myRefPtr;
827
828 // The type of translation to do when getting a sequence.
829 SequenceTranslation mySequenceTranslation;
830
831 String NOT_FOUND_TAG_STRING;
832 int NOT_FOUND_TAG_INT;
833
834 static const int myMaxWarns = 5;
835 static int myNumWarns;
836};
837
838#endif
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition CigarRoller.h:67
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition Cigar.h:84
HandlingType
This specifies how this class should respond to errors.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition InputFile.h:37
This class allows a user to get/set the fields in a SAM/BAM Header.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition SamRecord.h:52
int32_t getBlockSize()
Get the block size of the record (BAM format).
uint16_t getCigarLength()
Get the length of the BAM formatted CIGAR.
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
Definition SamRecord.h:57
@ NONE
Leave the sequence as is.
Definition SamRecord.h:58
@ BASES
Translate '=' to the actual base.
Definition SamRecord.h:60
@ EQUAL
Translate bases that match the reference to '='.
Definition SamRecord.h:59
bool setReadName(const char *readName)
Set QNAME to the passed in name.
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN).
bool checkString(const char *tag)
Check if the specified tag contains a string.
Definition SamRecord.h:600
int32_t get0BasedMatePosition()
Get the 0-based (BAM) leftmost mate/next fragment's position.
int32_t get1BasedPosition()
Get the 1-based (SAM) leftmost position (POS) of the record.
void clearTags()
Clear the tags in this record.
bool addIntTag(const char *tag, int32_t value)
Add the specified integer tag to the record.
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
bool getTagsString(const char *tags, String &returnString, char delim='\t')
Get the string representation of the tags from the record, formatted as TAG:TYPE:VALUE<delim>TAG:TYPE...
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
int32_t getAlignmentLength()
Returns the length of the clipped sequence, returning 0 if the cigar is '*'.
int & getInteger(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool.
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
int32_t get1BasedAlignmentEnd()
Returns the 1-based inclusive rightmost position of the clipped sequence.
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
SamRecord()
Default Constructor.
Definition SamRecord.cpp:34
static bool isIntegerType(char vtype)
Returns whether or not the specified vtype is an integer type.
bool rmTag(const char *tag, char type)
Remove a tag.
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment's reference sequence name (RNEXT) to the specified name,...
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
bool checkFloat(const char *tag)
Check if the specified tag contains a float.
Definition SamRecord.h:613
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
bool getFloatTag(const char *tag, float &tagVal)
Get the float value for the specified tag.
SamStatus::Status writeRecordBuffer(IFILE filePtr)
Write the record as a BAM into the specified already opened file.
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment's reference sequence name (RNEXT), returning "=" if it is the same as the ...
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
static bool isFloatType(char vtype)
Returns whether or not the specified vtype is a float type.
SamStatus::Status setBuffer(const char *fromBuffer, uint32_t fromBufferSize, SamFileHeader &header)
Sets the SamRecord to contain the information in the BAM formatted fromBuffer.
int32_t get1BasedUnclippedStart()
Returns the 1-based inclusive left-most position adjusted for clipped bases.
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
uint16_t getBin()
Get the BAM bin for the record.
bool isValid(SamFileHeader &header)
Returns whether or not the record is valid, setting the status to indicate success or failure.
int32_t getMateReferenceID()
Get the mate reference id of the record (BAM format: mate_rid/next_refID).
bool getFields(bamRecordStruct &recStruct, String &readName, String &cigar, String &sequence, String &quality)
Returns the values of all fields except the tags.
bool set0BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position using the specified 0-based (BAM format) value.
void resetRecord()
Reset the fields of the record to a default value.
Definition SamRecord.cpp:91
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader &header)
Read the BAM record from a file.
uint16_t getFlag()
Get the flag (FLAG).
const void * getRecordBuffer()
Get a const pointer to the buffer that contains the BAM representation of the record.
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
bool checkInteger(const char *tag)
Check if the specified tag contains an integer.
Definition SamRecord.h:607
int32_t get1BasedMatePosition()
Get the 1-based (SAM) leftmost mate/next fragment's position (PNEXT).
int32_t get0BasedUnclippedEnd()
Returns the 0-based inclusive right-most position adjusted for clipped bases.
bool shiftIndelsLeft()
Shift the indels (if any) to the left by updating the CIGAR.
int * getIntegerTag(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure...
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
static bool isCharType(char vtype)
Returns whether or not the specified vtype is a char type.
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
int32_t get1BasedUnclippedEnd()
Returns the 1-based inclusive right-most position adjusted for clipped bases.
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
const char * getMateReferenceName()
Get the mate/next fragment's reference sequence name (RNEXT).
bool checkTag(const char *tag, char type)
Check if the specified tag contains a value of the specified vtype.
bool getNextSamTag(char *tag, char &vtype, void **value)
Get the next tag from the record.
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
int32_t get0BasedUnclippedStart()
Returns the 0-based inclusive left-most position adjusted for clipped bases.
int32_t getReadLength()
Get the length of the read.
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position (PNEXT) using the specified 1-based (SAM format) value...
int32_t get0BasedPosition()
Get the 0-based (BAM) leftmost position of the record.
const char * getCigar()
Returns the SAM formatted CIGAR string.
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
const String & getString(const char *tag)
Get the string value for the specified tag.
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
void resetTagIter()
Reset the tag iterator to the beginning of the tags.
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
~SamRecord()
Destructor.
Definition SamRecord.cpp:72
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
bool rmTags(const char *tags)
Remove tags.
static bool isStringType(char vtype)
Returns whether or not the specified vtype is a string type.
This class is used to track the status results of some methods in the BAM classes.
Status
Return value enum for StatGenFile methods.
Structure of a BAM record.
Definition SamRecord.h:34