libStatGen Software 1
Loading...
Searching...
No Matches
GenomeSequence Class Reference

Create/Access/Modify/Load Genome Sequences stored as binary mapped files. More...

#include <GenomeSequence.h>

Inheritance diagram for GenomeSequence:
Collaboration diagram for GenomeSequence:

Public Member Functions

 GenomeSequence ()
 Simple constructor - no implicit file open.
 
void constructorClear ()
 
 GenomeSequence (std::string &referenceFilename)
 attempt to open an existing sequence
 
 GenomeSequence (const char *referenceFilename)
 Smarter constructor - attempt to open an existing sequence.
 
 ~GenomeSequence ()
 Close the file if open and destroy the object.
 
bool open (bool isColorSpace=false, int flags=O_RDONLY)
 open the reference specified using GenomeSequence::setReferenceName
 
bool open (const char *filename, int flags=O_RDONLY)
 open the given file as the genome (no filename munging occurs).
 
bool create (bool isColor=false)
 
void setProgressStream (std::ostream &progressStream)
 if set, then show progress when creating and pre-fetching
 
void setColorSpace (bool colorSpace)
 
void setSearchCommonFileSuffix (bool searchCommonFileSuffix)
 
void setCreateOverwrite (bool createOverwrite)
 
bool loadFastaData (const char *filename)
 
bool setReferenceName (std::string referenceFilename)
 set the reference name that will be used in open()
 
void setApplication (std::string application)
 set the application name in the binary file header
 
const std::string & getFastaName () const
 
const std::string & getReferenceName () const
 
bool isColorSpace () const
 tell us if we are a color space reference or not
 
genomeIndex_t getNumberBases () const
 return the number of bases represented in this reference
 
int getChromosome (genomeIndex_t position) const
 given a whole genome index, get the chromosome it is located in
 
int getChromosome (const char *chromosomeName) const
 given a chromosome name, return the chromosome index
 
int getChromosomeCount () const
 Return the number of chromosomes in the genome.
 
genomeIndex_t getChromosomeStart (int chromosomeIndex) const
 given a chromosome, return the genome base it starts in
 
genomeIndex_t getChromosomeSize (int chromosomeIndex) const
 given a chromosome, return its size in bases
 
genomeIndex_t getGenomePosition (const char *chromosomeName, unsigned int chromosomeIndex) const
 given a chromosome name and position, return the genome position
 
genomeIndex_t getGenomePosition (int chromosome, unsigned int chromosomeIndex) const
 given a chromosome index and position, return the genome position
 
genomeIndex_t getGenomePosition (const char *chromosomeName) const
 given the chromosome name, get the corresponding 0 based genome index for the start of that chromosome
 
genomeIndex_t getGenomePosition (int chromosomeIndex) const
 
const std::string & getBaseFilename () const
 
const char * getChromosomeName (int chromosomeIndex) const
 
void setDebugFlag (bool d)
 
genomeIndex_t sequenceLength () const
 
const char * chromosomeName (int chr) const
 
void sanityCheck (MemoryMap &fasta) const
 
std::string IntegerToSeq (unsigned int n, unsigned int wordsize) const
 
bool wordMatch (unsigned int index, std::string &word) const
 
bool printNearbyWords (unsigned int index, unsigned int variance, std::string &word) const
 
char BasePair (char c) const
 
void dumpSequenceSAMDictionary (std::ostream &) const
 
void dumpHeaderTSV (std::ostream &) const
 
char operator[] (genomeIndex_t index) const
 Return the bases in base space or color space for within range index, ot.
 
char getBase (const char *chromosomeName, unsigned int chromosomeIndex) const
 given a chromosome name and 1-based position, return the reference base.
 
uint8_t getInteger (genomeIndex_t index) const
 
void set (genomeIndex_t index, char value)
 
uint8_t * getDataPtr (genomeIndex_t index)
 obtain the pointer to the raw data for other access methods
 
void getReverseRead (std::string &read)
 
void getReverseRead (String &read)
 
int debugPrintReadValidation (std::string &read, std::string &quality, char direction, genomeIndex_t readLocation, int sumQuality, int mismatchCount, bool recurse=true)
 
void getString (std::string &str, int chromosome, uint32_t index, int baseCount) const
 
void getString (String &str, int chromosome, uint32_t index, int baseCount) const
 
void getString (std::string &str, genomeIndex_t index, int baseCount) const
 
void getString (String &str, genomeIndex_t index, int baseCount) const
 
void getHighLightedString (std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
 
void print30 (genomeIndex_t) const
 
genomeIndex_t simpleLocalAligner (std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
 
int getMismatchCount (std::string &read, genomeIndex_t location, char exclude='\0') const
 Return the mismatch count, disregarding CIGAR strings.
 
int getSumQ (std::string &read, std::string &qualities, genomeIndex_t location) const
 brute force sumQ - no sanity checking
 
void getMismatchHatString (std::string &result, const std::string &read, genomeIndex_t location) const
 
void getMismatchString (std::string &result, const std::string &read, genomeIndex_t location) const
 
void getChromosomeAndIndex (std::string &, genomeIndex_t) const
 
void getChromosomeAndIndex (String &, genomeIndex_t) const
 
bool checkRead (std::string &read, std::string &qualities, std::string &cigar, int &sumQ, int &gapOpenCount, int &gapExtendCount, int &gapDeleteCount, std::string &result) const
 check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct.
 
bool populateDBSNP (mmapArrayBool_t &dbSNP, IFILE inputFile) const
 
bool loadDBSNP (mmapArrayBool_t &dbSNP, const char *inputFileName) const
 user friendly dbSNP loader.
 
- Public Member Functions inherited from MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >
void constructorClear ()
 
const std::string & getErrorString ()
 
arrayHeaderClass & getHeader ()
 
void setContentCookie (uint32_t c)
 
void setContentVersion (uint32_t v)
 
elementT operator[] (indexT i)
 
void set (indexT i, elementT v)
 
int create (const char *file, indexT elementCount, int optionalHeaderCount=0)
 Create a vector with elementCount memebers.
 
int create (indexT elementCount, int optionalHeaderCount=0)
 allow anonymous (malloc) create.
 
bool open (const char *file, int flags=O_RDONLY)
 open a previously created mapped vector
 
bool close ()
 
void debugPrint (FILE *f)
 
size_t getElementCount () const
 
- Public Member Functions inherited from MemoryMap
void debug_print ()
 
void constructor_clear ()
 
void destructor_clear ()
 
virtual bool allocate ()
 
virtual bool create (const char *file, size_t size)
 create the memory mapped file on disk
 
virtual bool create (size_t size)
 store in allocated memory (malloc), not mmap:
 
bool close ()
 
void test ()
 
size_t length ()
 
char operator[] (unsigned int index)
 
int prefetch ()
 
void useMemoryMap (bool flag=true)
 

Additional Inherited Members

- Public Attributes inherited from MemoryMap
void * data
 
- Protected Attributes inherited from MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >
arrayHeaderClass * header
 
char * data
 
std::string errorStr
 

Detailed Description

Create/Access/Modify/Load Genome Sequences stored as binary mapped files.

GenomeSequence is designed to be a high performance shared access reference object.

It is implemented as a MemoryMapArray template object with unsigned 8 bit ints, each of which stores two bases. Although 2 bits could be used, most references have more than four symbols (usually at least including 'N', indicating an unknown or masked out base).

Normal use of this class follows these steps:

  1. create the reference
    1. instantiate the GenomeSequence class object
    2. create the actual file (memory mapped) that is to hold the data
    3. populate the data using GenomeSequence::set
  2. use the reference
    1. use the reference by instantiating a GenomeSequence object
    2. either use the constructor with the reference filename
    3. or use GenomeSequence::setReferenceName() followed by open
    4. access the bases via the overloaded array operator []
    5. check sequence length by using GenomeSequence::getNumberBases()
  3. accessing chromosomes in the reference
    1. you typically will need to know about the chromosomes in the sequence
    2. see methods and docs with prefix 'getChromosome'

Sharing is accomplished using the mmap() function via the MemoryMap base class. This allows a potentially large genome reference to be shared among a number of simultaneously executing instances of one or more programs sharing the same reference.

Definition at line 99 of file GenomeSequence.h.

Constructor & Destructor Documentation

◆ GenomeSequence() [1/3]

GenomeSequence::GenomeSequence ( )

Simple constructor - no implicit file open.

Definition at line 139 of file GenomeSequence.cpp.

140{
141 constructorClear();
142}

◆ GenomeSequence() [2/3]

GenomeSequence::GenomeSequence ( std::string &  referenceFilename)
inline

attempt to open an existing sequence

Parameters
referenceFilenamethe name of the reference fasta file to open
debugif true, additional debug information is printed

Definition at line 128 of file GenomeSequence.h.

129 {
130 constructorClear();
131 setup(referenceFilename.c_str());
132 }

◆ GenomeSequence() [3/3]

GenomeSequence::GenomeSequence ( const char *  referenceFilename)
inline

Smarter constructor - attempt to open an existing sequence.

Parameters
referenceFilenamethe name of the reference fasta file to open
debugif true, additional debug information is printed

Definition at line 138 of file GenomeSequence.h.

139 {
140 constructorClear();
141 setup(referenceFilename);
142 }

◆ ~GenomeSequence()

GenomeSequence::~GenomeSequence ( )

Close the file if open and destroy the object.

Definition at line 169 of file GenomeSequence.cpp.

170{
171 // free up resources:
172 _umfaFile.close();
173}

Member Function Documentation

◆ BasePair()

char GenomeSequence::BasePair ( char  c) const
inline

Definition at line 319 of file GenomeSequence.h.

320 {
321 return BaseAsciiMap::base2complement[(int) c];
322 }
static unsigned char base2complement[]
This table maps 5' base space to the 3' complement base space values, as well as 5' color space value...

◆ checkRead()

bool GenomeSequence::checkRead ( std::string &  read,
std::string &  qualities,
std::string &  cigar,
int &  sumQ,
int &  gapOpenCount,
int &  gapExtendCount,
int &  gapDeleteCount,
std::string &  result 
) const

check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct.

Parameters
readthe read in base space
qualitiesthe phred encoded qualities (Sanger, not Illumina)
cigarthe SAM file CIGAR column
sumQif >0 on entry, is checked against the computed sumQ
insertionscount of insertions found in

◆ chromosomeName()

const char * GenomeSequence::chromosomeName ( int  chr) const
inline

Definition at line 305 of file GenomeSequence.h.

306 {
307 return header->_chromosomes[chr].name;
308 }

◆ constructorClear()

void GenomeSequence::constructorClear ( )

Definition at line 144 of file GenomeSequence.cpp.

145{
146 _debugFlag = 0;
147 _progressStream = NULL;
148 _colorSpace = false;
149 _createOverwrite = false;
150}

◆ create()

bool GenomeSequence::create ( bool  isColor = false)

Definition at line 488 of file GenomeSequence.cpp.

489{
490 setColorSpace(isColor);
491
492 if (_baseFilename=="")
493 {
494 std::cerr << "Base reference filename is empty." << std::endl;
495 return true;
496 }
497
498 if (isColorSpace())
499 {
500 _umfaFilename = _baseFilename + "-cs.umfa";
501 }
502 else
503 {
504 _umfaFilename = _baseFilename + "-bs.umfa";
505 }
506
507 if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
508 {
509 std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
510 return true;
511 }
512
513 MemoryMap fastaFile;
514
515 if (fastaFile.open(_fastaFilename.c_str()))
516 {
517 std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
518 return true;
519 }
520
521 std::cerr << "Creating FASTA "
522 << (isColorSpace() ? "color space " : "")
523 << "binary cache file '"
524 << _umfaFilename
525 << "'."
526 << std::endl;
527
528 std::cerr << std::flush;
529
530 //
531 // simple ptr to fasta data -- just treat the memory map
532 // as an array of fastaDataSize characters...
533 //
534 const char *fasta = (const char *) fastaFile.data;
535 size_t fastaDataSize = fastaFile.length();
536
537 uint32_t chromosomeCount = 0;
538 uint64_t baseCount = 0;
539 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
540
541 if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
542 {
543 std::cerr << "failed to create '"
544 << _umfaFilename
545 << "'."
546 << std::endl;
547 perror("");
548 return true;
549 }
550 header->elementCount = 0;
551 header->_colorSpace = isColorSpace();
552 header->setApplication(_application.c_str());
553 header->_chromosomeCount = chromosomeCount;
554 //
555 // clear out the variable length chromosome info array
556 //
557 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
558
559 std::string chromosomeName;
560
561 //
562 // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N')
563 signed char lastBase = BaseAsciiMap::base2int[(int) 'N'];
564 bool terminateLoad = false;
565 int percent = -1, newPercent;
566 uint32_t whichChromosome = 0;
567 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
568 {
569 if (_progressStream)
570 {
571 newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
572 if (newPercent>percent)
573 {
574 *_progressStream << "\r" << newPercent << "% ";
575 *_progressStream << std::flush;
576 percent = newPercent;
577 }
578 }
579 switch (fasta[fastaIndex])
580 {
581 case '\n':
582 case '\r':
583 break;
584 case '>':
585 {
586 chromosomeName = "";
587 fastaIndex++; // skip the > char
588 //
589 // pull out the chromosome new name
590 //
591 while (!isspace(fasta[fastaIndex]))
592 {
593 chromosomeName += fasta[fastaIndex++]; // slow, but who cares
594 }
595 //
596 // eat the rest of the line
597 //
598 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
599 {
600 fastaIndex++;
601 }
602 //
603 // save the Chromosome name and index into our
604 // header so we can use them later.
605 //
606 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
607 c->setChromosomeName(chromosomeName.c_str());
608 c->start = header->elementCount;
609 // c->size gets computed at the next '>' line or at the EOF
610
611 if (whichChromosome>0)
612 {
613 //
614 // compute md5 checksum for the chromosome that we just
615 // loaded (if there was one) - note that on the last
616 // chromosome, we have to duplicate this code after
617 // the end of this loop
618 //
619 setChromosomeMD5andLength(whichChromosome - 1);
620 }
621 whichChromosome++;
622 if (whichChromosome > header->_chromosomeCount)
623 {
624 std::cerr << "BUG: Exceeded computed chromosome count ("
625 << header->_chromosomeCount
626 << ") - genome is now truncated at chromosome "
627 << header->_chromosomes[header->_chromosomeCount-1].name
628 << " (index "
629 << header->_chromosomeCount
630 << ")."
631 << std::endl;
632 terminateLoad = true;
633 }
634 break;
635 }
636 default:
637 // save the base pair value
638 // Note: invalid characters come here as well, but we
639 // let ::set deal with mapping them.
640 if (isColorSpace())
641 {
642//
643// anything outside these values represents an invalid base
644// base codes: 0-> A, 1-> C, 2-> G, 3-> T
645// colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
646//
647 const char fromBase2CS[] =
648 {
649 /* 0000 */ 0, // A->A
650 /* 0001 */ 1, // A->C
651 /* 0010 */ 2, // A->G
652 /* 0011 */ 3, // A->T
653 /* 0100 */ 1, // C->A
654 /* 0101 */ 0, // C->C
655 /* 0110 */ 3, // C->G
656 /* 0111 */ 2, // C->T
657 /* 1000 */ 2, // G->A
658 /* 1001 */ 3, // G->C
659 /* 1010 */ 0, // G->G
660 /* 1011 */ 1, // G->T
661 /* 1100 */ 3, // T->A
662 /* 1101 */ 2, // T->C
663 /* 1110 */ 1, // T->G
664 /* 1111 */ 0, // T->T
665 };
666 //
667 // we are writing color space values on transitions,
668 // so we don't write a colorspace value when we
669 // get the first base value.
670 //
671 // On second and subsequent bases, write based on
672 // the index table above
673 //
674 char thisBase =
675 BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
676 if (lastBase>=0)
677 {
678 char color;
679 if (lastBase>3 || thisBase>3) color=4;
680 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
681 // re-use the int to base, because ::set expects a base char (ATCG), not
682 // a color code (0123). It should only matter on final output.
683 set(header->elementCount++,
684 BaseAsciiMap::int2base[(int) color]);
685 }
686 lastBase = thisBase;
687 }
688 else
689 {
690 set(header->elementCount++, toupper(fasta[fastaIndex]));
691 }
692 break;
693 }
694
695 //
696 // slightly awkward exit handling when we exceed the fixed
697 // number of chromosomes
698 //
699 if (terminateLoad) break;
700 }
701
702 //
703 // also slightly awkward code to handle the last dangling chromosome...
704 // all we should need to do is compute the md5 checksum
705 //
706 if (whichChromosome==0)
707 {
708 fastaFile.close();
709 throw std::runtime_error("No chromosomes found - aborting!");
710 }
711 else
712 {
713 setChromosomeMD5andLength(whichChromosome-1);
714 }
715
716 fastaFile.close();
717
718 if (_progressStream) *_progressStream << "\r";
719
720 std::cerr << "FASTA binary cache file '"
721 << _umfaFilename
722 << "' created."
723 << std::endl;
724
725 //
726 // leave the umfastaFile open in case caller wants to use it
727 //
728 return false;
729}
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn).
static const char int2base[]
Convert from int representation to the base.
bool isColorSpace() const
tell us if we are a color space reference or not
int create(const char *file, indexT elementCount, int optionalHeaderCount=0)
Create a vector with elementCount memebers.
There are a pair of related data structures in the operating system, and also a few simple algorithms...
Definition MemoryMap.h:156
virtual bool open(const char *file, int flags=O_RDONLY)
open a previously created mapped vector

◆ debugPrintReadValidation()

int GenomeSequence::debugPrintReadValidation ( std::string &  read,
std::string &  quality,
char  direction,
genomeIndex_t  readLocation,
int  sumQuality,
int  mismatchCount,
bool  recurse = true 
)

Definition at line 855 of file GenomeSequence.cpp.

864{
865 int validateSumQ = 0;
866 int validateMismatchCount = 0;
867 int rc = 0;
868 std::string genomeData;
869
870 for (uint32_t i=0; i<read.size(); i++)
871 {
872 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
873 {
874 validateSumQ += quality[i] - '!';
875 // XXX no longer valid:
876 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
877 genomeData.push_back(tolower((*this)[readLocation + i]));
878 }
879 else
880 {
881 genomeData.push_back(toupper((*this)[readLocation + i]));
882 }
883 }
884 assert(validateSumQ>=0);
885 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
886 {
887 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
888 genomeData.c_str(),
889 read.c_str(),
890 validateSumQ,
891 sumQuality
892 );
893 rc++;
894 }
895 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
896 {
897 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
898 genomeData.c_str(),
899 read.c_str(),
900 validateMismatchCount,
901 mismatchCount
902 );
903 rc++;
904 }
905 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
906 {
907 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
908 genomeData.c_str(),
909 read.c_str(),
910 validateSumQ,
911 sumQuality,
912 validateMismatchCount,
913 mismatchCount
914 );
915 rc++;
916 }
917
918 if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
919 {
920 printf("large mismatch difference, trying reverse strand: ");
921 std::string reverseRead = read;
922 std::string reverseQuality = quality;
923 getReverseRead(reverseRead);
924 reverse(reverseQuality.begin(), reverseQuality.end());
925 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
926 }
927 return rc;
928}

◆ dumpHeaderTSV()

void GenomeSequence::dumpHeaderTSV ( std::ostream &  file) const

Definition at line 976 of file GenomeSequence.cpp.

977{
978 file << "# Reference: " << _baseFilename << std::endl;
979 file << "# SN: sample name - must be unique" << std::endl;
980 file << "# AS: assembly name" << std::endl;
981 file << "# SP: species" << std::endl;
982 file << "# LN: chromosome/contig length" << std::endl;
983 file << "# M5: chromosome/contig MD5 checksum" << std::endl;
984 file << "# LN and M5 are only printed for informational purposes." << std::endl;
985 file << "# Karma will only set those values when creating the index." << std::endl;
986 file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
987 for (unsigned int i=0; i<header->_chromosomeCount; i++)
988 {
989 file
990 << header->_chromosomes[i].name // name
991 << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
992 << "\t" << header->_chromosomes[i].uri
993 << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens
994 << "\t" << header->_chromosomes[i].size // number of bases
995 << "\t" << header->_chromosomes[i].md5
996 << std::endl;
997 }
998}

◆ dumpSequenceSAMDictionary()

void GenomeSequence::dumpSequenceSAMDictionary ( std::ostream &  file) const

Definition at line 960 of file GenomeSequence.cpp.

961{
962 for (unsigned int i=0; i<header->_chromosomeCount; i++)
963 {
964 file
965 << "@SQ"
966 << "\tSN:" << header->_chromosomes[i].name // name
967 << "\tLN:" << header->_chromosomes[i].size // number of bases
968 << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
969 << "\tM5:" << header->_chromosomes[i].md5
970 << "\tUR:" << header->_chromosomes[i].uri
971 << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens
972 << std::endl;
973 }
974}

◆ getBase()

char GenomeSequence::getBase ( const char *  chromosomeName,
unsigned int  chromosomeIndex 
) const
inline

given a chromosome name and 1-based position, return the reference base.

Parameters
chromosomeNamename of the chromosome - exact match only
chromosomeIndex1-based chromosome position
Returns
reference base at the above chromosome position

Definition at line 388 of file GenomeSequence.h.

390 {
391 genomeIndex_t index =
392 getGenomePosition(chromosomeName, chromosomeIndex);
393 if(index == INVALID_GENOME_INDEX)
394 {
395 // Invalid position, so return 'N'
396 return('N');
397 }
398 return((*this)[index]);
399 }
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position

References getGenomePosition().

Referenced by PileupElement::getRefBase().

◆ getBaseFilename()

const std::string & GenomeSequence::getBaseFilename ( ) const
inline

Definition at line 285 of file GenomeSequence.h.

286 {
287 return _baseFilename;
288 }

◆ getChromosome() [1/2]

int GenomeSequence::getChromosome ( const char *  chromosomeName) const

given a chromosome name, return the chromosome index

This is done via a linear search of the chromosome table in the header of the mapped file, so it is O(N)

Parameters
chromosomeNamethe name of the chromosome - exact match only
Returns
0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error

Definition at line 814 of file GenomeSequence.cpp.

815{
816 unsigned int i;
817 for (i=0; i<header->_chromosomeCount; i++)
818 {
819 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
820 {
821 return i;
822 }
823 }
824 return INVALID_CHROMOSOME_INDEX;
825}

◆ getChromosome() [2/2]

int GenomeSequence::getChromosome ( genomeIndex_t  position) const

given a whole genome index, get the chromosome it is located in

This is done via a binary search of the chromosome table in the header of the mapped file, so it is O(log(N))

Parameters
0-basedposition the base in the genome
Returns
0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error

Definition at line 737 of file GenomeSequence.cpp.

738{
739 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
740
741 if (header->_chromosomeCount == 0)
742 return INVALID_CHROMOSOME_INDEX;
743
744 int start = 0;
745 int stop = header->_chromosomeCount - 1;
746
747 // eliminate case where position is in the last chromosome, since the loop
748 // below falls off the end of the list if it in the last one.
749
750 if (position > header->_chromosomes[stop].start)
751 return (stop);
752
753 while (start <= stop)
754 {
755 int middle = (start + stop) / 2;
756
757 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
758 return middle;
759
760 if (position == header->_chromosomes[middle + 1].start)
761 return (middle + 1);
762
763 if (position > header->_chromosomes[middle + 1].start)
764 start = middle + 1;
765
766 if (position < header->_chromosomes[middle].start)
767 stop = middle - 1;
768 }
769
770 return -1;
771}

Referenced by getGenomePosition().

◆ getChromosomeAndIndex() [1/2]

void GenomeSequence::getChromosomeAndIndex ( std::string &  s,
genomeIndex_t  i 
) const

Definition at line 1165 of file GenomeSequence.cpp.

1166{
1167 int whichChromosome = 0;
1168
1169 whichChromosome = getChromosome(i);
1170
1171 if (whichChromosome == INVALID_CHROMOSOME_INDEX)
1172 {
1173 s = "invalid genome index"; // TODO include the index in error
1174 }
1175 else
1176 {
1177 std::ostringstream buf;
1178 genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
1179 buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex;
1180#if 0
1181 buf << " (GenomeIndex " << i << ")";
1182#endif
1183 s = buf.str();
1184 }
1185 return;
1186}
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in

◆ getChromosomeAndIndex() [2/2]

void GenomeSequence::getChromosomeAndIndex ( String s,
genomeIndex_t  i 
) const

Definition at line 1189 of file GenomeSequence.cpp.

1190{
1191 std::string ss;
1192 getChromosomeAndIndex(ss, i);
1193 s = ss.c_str();
1194 return;
1195}

◆ getChromosomeCount()

int GenomeSequence::getChromosomeCount ( ) const

Return the number of chromosomes in the genome.

Returns
number of chromosomes in the genome

Definition at line 731 of file GenomeSequence.cpp.

732{
733 return header->_chromosomeCount;
734}

◆ getChromosomeName()

const char * GenomeSequence::getChromosomeName ( int  chromosomeIndex) const
inline

Definition at line 290 of file GenomeSequence.h.

291 {
292 return header->_chromosomes[chromosomeIndex].name;
293 }

◆ getChromosomeSize()

genomeIndex_t GenomeSequence::getChromosomeSize ( int  chromosomeIndex) const
inline

given a chromosome, return its size in bases

Parameters
0-basedchromosome index
Returns
size of the chromosome in bases

Definition at line 256 of file GenomeSequence.h.

257 {
258 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
259 return header->_chromosomes[chromosomeIndex].size;
260 }

◆ getChromosomeStart()

genomeIndex_t GenomeSequence::getChromosomeStart ( int  chromosomeIndex) const
inline

given a chromosome, return the genome base it starts in

Parameters
0-basedchromosome index
Returns
0-based genome index of the base that starts the chromosome

Definition at line 246 of file GenomeSequence.h.

247 {
248 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
249 return header->_chromosomes[chromosomeIndex].start;
250 }

◆ getDataPtr()

uint8_t * GenomeSequence::getDataPtr ( genomeIndex_t  index)
inline

obtain the pointer to the raw data for other access methods

this is a fairly ugly hack to reach into the raw genome vector, get the byte that encodes two bases, and return it. This is used by karma ReadIndexer::getSumQ to compare genome matchines by byte (two bases at a time) to speed it up.

Definition at line 422 of file GenomeSequence.h.

423 {
424 return ((uint8_t *) data + index/2);
425 }

◆ getFastaName()

const std::string & GenomeSequence::getFastaName ( ) const
inline

Definition at line 198 of file GenomeSequence.h.

199 {
200 return _fastaFilename;
201 }

◆ getGenomePosition() [1/3]

genomeIndex_t GenomeSequence::getGenomePosition ( const char *  chromosomeName) const

given the chromosome name, get the corresponding 0 based genome index for the start of that chromosome

Definition at line 807 of file GenomeSequence.cpp.

808{
809 int chromosome = getChromosome(chromosomeName);
810 if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
811 return header->_chromosomes[chromosome].start;
812}

References getChromosome().

◆ getGenomePosition() [2/3]

genomeIndex_t GenomeSequence::getGenomePosition ( const char *  chromosomeName,
unsigned int  chromosomeIndex 
) const

given a chromosome name and position, return the genome position

Parameters
chromosomeNamename of the chromosome - exact match only
chromosomeIndex1-based chromosome position
Returns
genome index of the above chromosome position

Definition at line 779 of file GenomeSequence.cpp.

782{
783 genomeIndex_t i = getGenomePosition(chromosomeName);
784 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
785 return i + chromosomeIndex - 1;
786}

References getGenomePosition().

Referenced by SamTags::createMDTag(), getBase(), getGenomePosition(), SamQuerySeqWithRefIter::reset(), SamQuerySeqWithRef::seqWithEquals(), and SamQuerySeqWithRef::seqWithoutEquals().

◆ getGenomePosition() [3/3]

genomeIndex_t GenomeSequence::getGenomePosition ( int  chromosome,
unsigned int  chromosomeIndex 
) const

given a chromosome index and position, return the genome position

Parameters
chromosomeindex of the chromosome
chromosomeIndex1-based chromosome position
Returns
genome index of the above chromosome position

Definition at line 788 of file GenomeSequence.cpp.

791{
792 if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
793
794 genomeIndex_t i = header->_chromosomes[chromosome].start;
795 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
796 return i + chromosomeIndex - 1;
797}

◆ getHighLightedString()

void GenomeSequence::getHighLightedString ( std::string &  str,
genomeIndex_t  index,
int  baseCount,
genomeIndex_t  highLightStart,
genomeIndex_t  highLightEnd 
) const

Definition at line 1046 of file GenomeSequence.cpp.

1047{
1048 str.clear();
1049 if (baseCount > 0)
1050 {
1051 for (int i=0; i<baseCount; i++)
1052 {
1053 char base = (*this)[index + i];
1054 if (in(index+i, highLightStart, highLightEnd))
1055 base = tolower(base);
1056 str.push_back(base);
1057 }
1058 }
1059 else
1060 {
1061 // if caller passed negative basecount, give them
1062 // the read for the 3' end
1063 for (int i=0; i< -baseCount; i++)
1064 {
1065 char base = BaseAsciiMap::base2complement[(int)(*this)[index + i]];
1066 if (in(index+i, highLightStart, highLightEnd))
1067 base = tolower(base);
1068 str.push_back(base);
1069 }
1070 }
1071}

◆ getInteger()

uint8_t GenomeSequence::getInteger ( genomeIndex_t  index) const
inline

Definition at line 402 of file GenomeSequence.h.

403 {
404 return (*((genomeSequenceArray*) this))[index];
405 }

◆ getMismatchCount()

int GenomeSequence::getMismatchCount ( std::string &  read,
genomeIndex_t  location,
char  exclude = '\0' 
) const
inline

Return the mismatch count, disregarding CIGAR strings.

Parameters
readis the sequence we're counting mismatches in
locationis where in the genmoe we start comparing
excludeis a wildcard character (e.g. '.' or 'N')
Returns
number of bases that don't match the reference, except those that match exclude

Definition at line 488 of file GenomeSequence.h.

489 {
490 int mismatchCount = 0;
491 for (uint32_t i=0; i<read.size(); i++)
492 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
493 return mismatchCount;
494 };

◆ getMismatchHatString()

void GenomeSequence::getMismatchHatString ( std::string &  result,
const std::string &  read,
genomeIndex_t  location 
) const

Definition at line 1085 of file GenomeSequence.cpp.

1086{
1087 result = "";
1088 for (uint32_t i=0; i < read.size(); i++)
1089 {
1090 if (read[i] == (*this)[location+i])
1091 result.push_back(' ');
1092 else
1093 result.push_back('^');
1094 }
1095}

◆ getMismatchString()

void GenomeSequence::getMismatchString ( std::string &  result,
const std::string &  read,
genomeIndex_t  location 
) const

Definition at line 1097 of file GenomeSequence.cpp.

1098{
1099 result = "";
1100 for (uint32_t i=0; i < read.size(); i++)
1101 {
1102 if (read[i] == (*this)[location+i])
1103 result.push_back(toupper(read[i]));
1104 else
1105 result.push_back(tolower(read[i]));
1106 }
1107}

◆ getNumberBases()

genomeIndex_t GenomeSequence::getNumberBases ( ) const
inline

return the number of bases represented in this reference

Returns
count of bases

Definition at line 216 of file GenomeSequence.h.

217 {
218 return getElementCount();
219 }

Referenced by loadDBSNP(), and operator[]().

◆ getReferenceName()

const std::string & GenomeSequence::getReferenceName ( ) const
inline

Definition at line 202 of file GenomeSequence.h.

203 {
204 return _referenceFilename;
205 }

◆ getReverseRead() [1/2]

void GenomeSequence::getReverseRead ( std::string &  read)

Definition at line 831 of file GenomeSequence.cpp.

832{
833 std::string newRead;
834 if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
835 {
836 newRead.push_back(BasePair(read[i]));
837 }
838 read = newRead;
839}

◆ getReverseRead() [2/2]

void GenomeSequence::getReverseRead ( String read)

Definition at line 841 of file GenomeSequence.cpp.

842{
843 int i = 0;
844 int j = read.Length()-1;
845 char temp;
846 while (i < j)
847 {
848 temp = read[j];
849 read[j] = read[i];
850 read[i] = temp;
851 }
852}

◆ getString() [1/4]

void GenomeSequence::getString ( std::string &  str,
genomeIndex_t  index,
int  baseCount 
) const

Definition at line 1018 of file GenomeSequence.cpp.

1019{
1020 str.clear();
1021 if (baseCount > 0)
1022 {
1023 for (int i=0; i<baseCount; i++)
1024 {
1025 str.push_back((*this)[index + i]);
1026 }
1027 }
1028 else
1029 {
1030 // if caller passed negative basecount, give them
1031 // the read for the 3' end
1032 for (int i=0; i< -baseCount; i++)
1033 {
1034 str.push_back(BaseAsciiMap::base2complement[(int)(*this)[index + i]]);
1035 }
1036 }
1037}

◆ getString() [2/4]

void GenomeSequence::getString ( std::string &  str,
int  chromosome,
uint32_t  index,
int  baseCount 
) const

Definition at line 1001 of file GenomeSequence.cpp.

1002{
1003 //
1004 // calculate the genome index for the lazy caller...
1005 //
1006 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
1007
1008 getString(str, genomeIndex, baseCount);
1009}

◆ getString() [3/4]

void GenomeSequence::getString ( String str,
genomeIndex_t  index,
int  baseCount 
) const

Definition at line 1039 of file GenomeSequence.cpp.

1040{
1041 std::string string;
1042 getString(string, index, baseCount);
1043 str = string.c_str();
1044}

◆ getString() [4/4]

void GenomeSequence::getString ( String str,
int  chromosome,
uint32_t  index,
int  baseCount 
) const

Definition at line 1011 of file GenomeSequence.cpp.

1012{
1013 std::string string;
1014 this-> getString(string, chromosome, index, baseCount);
1015 str = string.c_str();
1016}

◆ getSumQ()

int GenomeSequence::getSumQ ( std::string &  read,
std::string &  qualities,
genomeIndex_t  location 
) const
inline

brute force sumQ - no sanity checking

Parameters
readshotgun sequencer read string
qualitiesphred quality string of same length
locationthe alignment location to check sumQ

Definition at line 501 of file GenomeSequence.h.

502 {
503 int sumQ = 0;
504 for (uint32_t i=0; i<read.size(); i++)
505 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
506 return sumQ;
507 };

◆ IntegerToSeq()

std::string GenomeSequence::IntegerToSeq ( unsigned int  n,
unsigned int  wordsize 
) const

Definition at line 118 of file GenomeSequence.cpp.

119{
120 std::string sequence("");
121 for (unsigned int i = 0; i < wordsize; i ++)
122 sequence += "N";
123
124 unsigned int clearHigherBits = ~(3U << (wordsize<<1)); // XXX refactor - this appears several places
125
126 if (n > clearHigherBits)
127 error("%d needs to be a non-negative integer < clearHigherBits\n", n);
128
129 for (unsigned int i = 0; i < wordsize; i++)
130 {
131 sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3];
132 n >>= 2;
133 }
134 return sequence;
135}

◆ isColorSpace()

bool GenomeSequence::isColorSpace ( ) const
inline

tell us if we are a color space reference or not

Returns
true if colorspace, false otherwise

Definition at line 209 of file GenomeSequence.h.

210 {
211 return _colorSpace;
212 }

Referenced by open(), and operator[]().

◆ loadDBSNP()

bool GenomeSequence::loadDBSNP ( mmapArrayBool_t dbSNP,
const char *  inputFileName 
) const

user friendly dbSNP loader.

Parameters
inputFileNamemay be empty, point to a text file or a dbSNP vector file

In all cases, dbSNP is returned the same length as this genome.

When no SNPs are loaded, all values are false.

When a text file is given, the file is parsed with two space separated columns - the first column is the chromosome name, and the second is the 1-based chromosome position of the SNP.

Returns
false if a dbSNP file was correctly loaded, true otherwise

Definition at line 1301 of file GenomeSequence.cpp.

1304{
1305 //
1306 // the goal in this section of code is to allow the user
1307 // to either specify a valid binary version of the SNP file,
1308 // or the original text file that it gets created from.
1309 //
1310 // To do this, we basically open, sniff the error message,
1311 // and if it claims it is not a binary version of the file,
1312 // we go ahead and treat it as the text file and use the
1313 // GenomeSequence::populateDBSNP method to load it.
1314 //
1315 // Further checking is really needed to ensure users don't
1316 // mix a dbSNP file for a different reference, since it is really
1317 // easy to do.
1318 //
1319 if (strlen(inputFileName)!=0)
1320 {
1321 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
1322
1323 if (dbSNP.open(inputFileName, O_RDONLY))
1324 {
1325 //
1326 // failed to open, possibly due to bad magic.
1327 //
1328 // this is really awful ... need to have a return
1329 // code that is smart enough to avoid this ugliness:
1330 //
1331 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
1332 {
1333 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
1334 exit(1);
1335 }
1336 //
1337 // we have a file, assume we can load it as a text file
1338 //
1339 IFILE inputFile = ifopen(inputFileName, "r");
1340 if(inputFile == NULL)
1341 {
1342 std::cerr << "Error: failed to open " << inputFileName << std::endl;
1343 exit(1);
1344 }
1345
1346 std::cerr << "(as text file) ";
1347
1348 // anonymously (RAM resident only) create:
1349 dbSNP.create(getNumberBases());
1350
1351 // now load it into RAM
1352 populateDBSNP(dbSNP, inputFile);
1353 ifclose(inputFile);
1354
1355 }
1356 else
1357 {
1358 std::cerr << "(as binary mapped file) ";
1359 }
1360
1361 std::cerr << "DONE!" << std::endl;
1362 return false;
1363 }
1364 else
1365 {
1366 return true;
1367 }
1368}
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition InputFile.h:580
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition InputFile.h:37
bool open(const char *file, int flags=O_RDONLY)
open a previously created mapped vector

References MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::create(), getNumberBases(), ifclose(), ifopen(), and MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::open().

◆ open() [1/2]

bool GenomeSequence::open ( bool  isColorSpace = false,
int  flags = O_RDONLY 
)

open the reference specified using GenomeSequence::setReferenceName

Parameters
isColorSpaceopen the color space reference
flagspass through to the open() call (O_RDWR lets you modify the contents)
Returns
false for success, true otherwise

Definition at line 182 of file GenomeSequence.cpp.

183{
184 bool rc;
185
186 if (isColorSpace)
187 {
188 _umfaFilename = _baseFilename + "-cs.umfa";
189 }
190 else
191 {
192 _umfaFilename = _baseFilename + "-bs.umfa";
193 }
194
195 if(access(_umfaFilename.c_str(), R_OK) != 0)
196 {
197 // umfa file doesn't exist, so try to create it.
198 if(create(isColorSpace))
199 {
200 // Couldon't access or create the umfa.
201 std::cerr << "GenomeSequence::open: failed to open file "
202 << _umfaFilename
203 << " also failed creating it."
204 << std::endl;
205 return true;
206 }
207 }
208
209 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
210 if (rc)
211 {
212 std::cerr << "GenomeSequence::open: failed to open file "
213 << _umfaFilename
214 << std::endl;
215 return true;
216 }
217
218 _colorSpace = header->_colorSpace;
219
220 return false;
221}

References isColorSpace(), and MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::open().

◆ open() [2/2]

bool GenomeSequence::open ( const char *  filename,
int  flags = O_RDONLY 
)
inlinevirtual

open the given file as the genome (no filename munging occurs).

Parameters
filenamethe name of the file to open
flagspass through to the open() call (O_RDWR lets you modify the contents)
Returns
false for success, true otherwise

Reimplemented from MemoryMap.

Definition at line 159 of file GenomeSequence.h.

160 {
161 _umfaFilename = filename;
162 // TODO - should this method be doing something???
163 return false;
164 }

◆ operator[]()

char GenomeSequence::operator[] ( genomeIndex_t  index) const
inline

Return the bases in base space or color space for within range index, ot.

Parameters
indexthe array-like index (0 based).
Returns
ACTGN in base space; 0123N for color space; and 'N' for invalid. For color space, index i represents the transition of base at position (i-1) to base at position i

NB: bounds checking here needs to be deprecated - do not assume it will exist - the call must clip reads so that this routine is never called with a index value larger than the genome.

The reason for this is simply that this routine gets called hundreds of billions of time in one run of karma, which will absolutely kill performance. Every single instruction here matters a great, great deal.

Definition at line 361 of file GenomeSequence.h.

362 {
363 uint8_t val;
364 if (index < getNumberBases())
365 {
366 if ((index&1)==0)
367 {
368 val = ((uint8_t *) data)[index>>1] & 0xf;
369 }
370 else
371 {
372 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
373 }
374 }
375 else
376 {
378 }
381 return val;
382 }
static const char int2colorSpace[]
Convert from int representation to colorspace representation.
static const int baseNIndex
Value associated with 'N' in the ascii to base map (bad read).

References BaseAsciiMap::baseNIndex, getNumberBases(), BaseAsciiMap::int2base, BaseAsciiMap::int2colorSpace, and isColorSpace().

◆ populateDBSNP()

bool GenomeSequence::populateDBSNP ( mmapArrayBool_t dbSNP,
IFILE  inputFile 
) const

Definition at line 1213 of file GenomeSequence.cpp.

1216{
1217 assert(dbSNP.getElementCount() == getNumberBases());
1218
1219 if(inputFile == NULL)
1220 {
1221 // FAIL, file not opened.
1222 return(false);
1223 }
1224
1225 std::string chromosomeName;
1226 std::string position;
1227 genomeIndex_t chromosomePosition1; // 1-based
1228 uint64_t ignoredLineCount = 0;
1229
1230 // Read til the end of the file.
1231 char* postPosPtr = NULL;
1232 while(!inputFile->ifeof())
1233 {
1234 chromosomeName.clear();
1235 position.clear();
1236 // Read the chromosome
1237 if(inputFile->readTilTab(chromosomeName) <= 0)
1238 {
1239 // hit either eof or end of line, check if
1240 // it is a header.
1241 if(chromosomeName.size()>0 && chromosomeName[0]=='#')
1242 {
1243 // header, so just continue.
1244 continue;
1245 }
1246 // Not the header, so this line is poorly formatted.
1247 ++ignoredLineCount;
1248 // Continue to the next line.
1249 continue;
1250 }
1251
1252 // Check if it is a header line.
1253 if(chromosomeName.size()>0 && chromosomeName[0]=='#')
1254 {
1255 // did not hit eof or end of line,
1256 // so discard the rest of the line.
1257 inputFile->discardLine();
1258 continue;
1259 }
1260
1261 // Not a header, so read the position.
1262 if(inputFile->readTilTab(position) > 0)
1263 {
1264 // Additional data on the line, so discard it.
1265 inputFile->discardLine();
1266 }
1267
1268 // Convert the position to a string.
1269 chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0);
1270
1271
1272 if(postPosPtr == position.c_str())
1273 {
1274 ++ignoredLineCount;
1275 continue;
1276 }
1277
1278 // 1-based genome index.
1279 genomeIndex_t genomeIndex =
1280 getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
1281
1282 // if the genome index is invalid, ignore it
1283 if((genomeIndex == INVALID_GENOME_INDEX) ||
1284 (genomeIndex > getNumberBases()))
1285 {
1286 ignoredLineCount++;
1287 continue;
1288 }
1289
1290 dbSNP.set(genomeIndex, true);
1291 }
1292
1293 if (ignoredLineCount > 0)
1294 {
1295 std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
1296 return false;
1297 }
1298 return true;
1299}
int ifeof() const
Check to see if we have reached the EOF.
Definition InputFile.h:386
int discardLine()
Read until the end of the line, discarding the characters, returning -1 returned for EOF and returnin...
Definition InputFile.cpp:95
int readTilTab(std::string &field)
Read, appending the characters into the specified string until tab, new line, or EOF is found,...

◆ print30()

void GenomeSequence::print30 ( genomeIndex_t  index) const

Definition at line 1073 of file GenomeSequence.cpp.

1074{
1075 std::cout << "index: " << index << "\n";
1076 for (genomeIndex_t i=index-30; i<index+30; i++)
1077 std::cout << (*this)[i];
1078 std::cout << "\n";
1079 for (genomeIndex_t i=index-30; i<index; i++)
1080 std::cout << " ";
1081 std::cout << "^";
1082 std::cout << std::endl;
1083}

◆ printNearbyWords()

bool GenomeSequence::printNearbyWords ( unsigned int  index,
unsigned int  variance,
std::string &  word 
) const

Definition at line 941 of file GenomeSequence.cpp.

942{
943 for (unsigned int i = index - deviation; i < index + deviation; i++)
944 {
945 if (wordMatch(i, word))
946 {
947 std::cerr << "word '"
948 << word
949 << "' found "
950 << i - index
951 << " away from position "
952 << index
953 << "."
954 << std::endl;
955 }
956 }
957 return false;
958}

◆ sanityCheck()

void GenomeSequence::sanityCheck ( MemoryMap fasta) const

Definition at line 223 of file GenomeSequence.cpp.

224{
225 unsigned int i;
226
227 unsigned int genomeIndex = 0;
228 for (i=0; i<fasta.length(); i++)
229 {
230 switch (fasta[i])
231 {
232 case '>':
233 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
234 break;
235 case '\n':
236 case '\r':
237 break;
238 default:
239 assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] ==
240 BaseAsciiMap::base2int[(int) fasta[i]]);
241 genomeIndex++;
242 break;
243 }
244 }
245}

◆ sequenceLength()

genomeIndex_t GenomeSequence::sequenceLength ( ) const
inline

Definition at line 300 of file GenomeSequence.h.

301 {
302 return (genomeIndex_t) header->elementCount;
303 }

◆ set()

void GenomeSequence::set ( genomeIndex_t  index,
char  value 
)
inline

Definition at line 407 of file GenomeSequence.h.

408 {
409 genomeSequenceArray::set(index,
410 BaseAsciiMap::base2int[(uint8_t) value]);
411 }

◆ setApplication()

void GenomeSequence::setApplication ( std::string  application)
inline

set the application name in the binary file header

Parameters
applicationname of the application

Definition at line 194 of file GenomeSequence.h.

195 {
196 _application = application; // used in ::create() to set application name
197 }

◆ setColorSpace()

void GenomeSequence::setColorSpace ( bool  colorSpace)
inline

Definition at line 176 of file GenomeSequence.h.

176{_colorSpace = colorSpace; }

◆ setCreateOverwrite()

void GenomeSequence::setCreateOverwrite ( bool  createOverwrite)
inline

Definition at line 180 of file GenomeSequence.h.

180{_createOverwrite = createOverwrite;}

◆ setDebugFlag()

void GenomeSequence::setDebugFlag ( bool  d)
inline

Definition at line 295 of file GenomeSequence.h.

296 {
297 _debugFlag = d;
298 }

◆ setProgressStream()

void GenomeSequence::setProgressStream ( std::ostream &  progressStream)
inline

if set, then show progress when creating and pre-fetching

Definition at line 175 of file GenomeSequence.h.

175{_progressStream = &progressStream;}

◆ setReferenceName()

bool GenomeSequence::setReferenceName ( std::string  referenceFilename)

set the reference name that will be used in open()

Parameters
referenceFilenamethe name of the reference fasta file to open
Returns
false for success, true otherwise
See also
open()

Definition at line 254 of file GenomeSequence.cpp.

255{
256
257 if (HAS_SUFFIX(referenceFilename, ".fa"))
258 {
259 _referenceFilename = referenceFilename;
260 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
261 }
262 else if (HAS_SUFFIX(referenceFilename, ".umfa"))
263 {
264 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
265 }
266 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
267 {
268 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
269 }
270 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
271 {
272 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
273 }
274 else
275 {
276 _baseFilename = referenceFilename;
277 }
278 _fastaFilename = _baseFilename + ".fa";
279
280 if (HAS_SUFFIX(referenceFilename, ".fasta"))
281 {
282 _referenceFilename = referenceFilename;
283 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
284 _fastaFilename = _baseFilename + ".fasta";
285 }
286
287 return false;
288}

◆ setSearchCommonFileSuffix()

void GenomeSequence::setSearchCommonFileSuffix ( bool  searchCommonFileSuffix)
inline

Definition at line 177 of file GenomeSequence.h.

177{_searchCommonFileSuffix = searchCommonFileSuffix;}

◆ simpleLocalAligner()

genomeIndex_t GenomeSequence::simpleLocalAligner ( std::string &  read,
std::string &  quality,
genomeIndex_t  index,
int  windowSize 
) const

Definition at line 1109 of file GenomeSequence.cpp.

1110{
1111 int bestScore = 1000000; // either mismatch count or sumQ
1112 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
1113 for (int i=-windowSize; i<windowSize; i++)
1114 {
1115 int newScore;
1116
1117 if (i<0 && ((uint32_t) -i) > index) continue;
1118 if (index + i + read.size() >= getNumberBases()) continue;
1119
1120 if (quality=="")
1121 {
1122 newScore = this->getMismatchCount(read, index + i);
1123 }
1124 else
1125 {
1126 newScore = this->getSumQ(read, quality, index + i);
1127 }
1128 if (newScore < bestScore)
1129 {
1130 bestScore = newScore;
1131 bestMatchLocation = index + i;
1132 }
1133 }
1134 return bestMatchLocation;
1135}
int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
brute force sumQ - no sanity checking
int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
Return the mismatch count, disregarding CIGAR strings.

◆ wordMatch()

bool GenomeSequence::wordMatch ( unsigned int  index,
std::string &  word 
) const

Definition at line 932 of file GenomeSequence.cpp.

933{
934 for (uint32_t i = 0; i<word.size(); i++)
935 {
936 if ((*this)[index + i] != word[i]) return false;
937 }
938 return true;
939}

The documentation for this class was generated from the following files: