libStatGen Software 1
CigarRoller.cpp
1/*
2 * Copyright (C) 2010-2011 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <stdio.h>
19#include <stdlib.h>
20#include <ctype.h>
21#include "CigarRoller.h"
22
23////////////////////////////////////////////////////////////////////////
24//
25// Cigar Roller Class
26//
27
28
30{
31 std::vector<CigarOperator>::iterator i;
32 for (i = rhs.cigarOperations.begin(); i != rhs.cigarOperations.end(); i++)
33 {
34 (*this) += *i;
35 }
36 return *this;
37}
38
39
40//
41// Append a new operator at the end of the sequence.
42//
44{
45 // Adding to the cigar, so the query & reference indexes would be
46 // incomplete, so just clear them.
47 clearQueryAndReferenceIndexes();
48
49 if (rhs.count==0)
50 {
51 // nothing to do
52 }
53 else if (cigarOperations.empty() || cigarOperations.back() != rhs)
54 {
55 cigarOperations.push_back(rhs);
56 }
57 else
58 {
59 // last stored operation is the same as the new one, so just add it in
60 cigarOperations.back().count += rhs.count;
61 }
62 return *this;
63}
64
65
67{
68 clear();
69
70 (*this) += rhs;
71
72 return *this;
73}
74
75
76//
77void CigarRoller::Add(Operation operation, int count)
78{
79 CigarOperator rhs(operation, count);
80 (*this) += rhs;
81}
82
83
84void CigarRoller::Add(char operation, int count)
85{
86 switch (operation)
87 {
88 case 0:
89 case 'M':
90 Add(match, count);
91 break;
92 case 1:
93 case 'I':
94 Add(insert, count);
95 break;
96 case 2:
97 case 'D':
98 Add(del, count);
99 break;
100 case 3:
101 case 'N':
102 Add(skip, count);
103 break;
104 case 4:
105 case 'S':
106 Add(softClip, count);
107 break;
108 case 5:
109 case 'H':
110 Add(hardClip, count);
111 break;
112 case 6:
113 case 'P':
114 Add(pad, count);
115 break;
116 case 7:
117 case '=':
118 Add(match, count);
119 break;
120 case 8:
121 case 'X':
122 Add(match, count);
123 break;
124 default:
125 // Hmmm... what to do?
126 std::cerr << "ERROR "
127 << "(" << __FILE__ << ":" << __LINE__ <<"): "
128 << "Parsing CIGAR - invalid character found "
129 << "with parameter " << operation << " and " << count
130 << std::endl;
131 break;
132 }
133}
134
135
136void CigarRoller::Add(const char *cigarString)
137{
138 int operationCount = 0;
139 while (*cigarString)
140 {
141 if (isdigit(*cigarString))
142 {
143 char *endPtr;
144 operationCount = strtol((char *) cigarString, &endPtr, 10);
145 cigarString = endPtr;
146 }
147 else
148 {
149 Add(*cigarString, operationCount);
150 cigarString++;
151 }
152 }
153}
154
155
156bool CigarRoller::Remove(int index)
157{
158 if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
159 {
160 // can't remove, out of range, return false.
161 return(false);
162 }
163 cigarOperations.erase(cigarOperations.begin() + index);
164 // Modifying the cigar, so the query & reference indexes are out of date,
165 // so clear them.
166 clearQueryAndReferenceIndexes();
167 return(true);
168}
169
170
171bool CigarRoller::IncrementCount(int index, int increment)
172{
173 if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
174 {
175 // can't update, out of range, return false.
176 return(false);
177 }
178 cigarOperations[index].count += increment;
179
180 // Modifying the cigar, so the query & reference indexes are out of date,
181 // so clear them.
182 clearQueryAndReferenceIndexes();
183 return(true);
184}
185
186
187bool CigarRoller::Update(int index, Operation op, int count)
188{
189 if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
190 {
191 // can't update, out of range, return false.
192 return(false);
193 }
194 cigarOperations[index].operation = op;
195 cigarOperations[index].count = count;
196
197 // Modifying the cigar, so the query & reference indexes are out of date,
198 // so clear them.
199 clearQueryAndReferenceIndexes();
200 return(true);
201}
202
203
204void CigarRoller::Set(const char *cigarString)
205{
206 clear();
207 Add(cigarString);
208}
209
210
211void CigarRoller::Set(const uint32_t* cigarBuffer, uint16_t bufferLen)
212{
213 clear();
214
215 // Parse the buffer.
216 for (int i = 0; i < bufferLen; i++)
217 {
218 int opLen = cigarBuffer[i] >> 4;
219
220 Add(cigarBuffer[i] & 0xF, opLen);
221 }
222}
223
224
225//
226// when we examine CIGAR strings, we need to know how
227// many cumulative insert and delete positions there are
228// so that we can adjust the read location appropriately.
229//
230// Here, we iterate over the vector of CIGAR operations,
231// summaring the count for each insert or delete (insert
232// increases the offset, delete decreases it).
233//
234// The use case for this is when we have a genome match
235// position based on an index word other than the first one,
236// and there is also a insert or delete between the beginning
237// of the read and the index word. We can't simply report
238// the match position without taking into account the indels,
239// otherwise we'll be off by N where N is the sum of this
240// indel count.
241//
242// DEPRECATED - do not use. There are better ways to accomplish that by using
243// read lengths, reference lengths, span of the read, etc.
245{
246 int offset = 0;
247 std::vector<CigarOperator>::iterator i;
248
249 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
250 {
251 switch (i->operation)
252 {
253 case insert:
254 offset += i->count;
255 break;
256 case del:
257 offset -= i->count;
258 break;
259 // TODO anything for case skip:????
260 default:
261 break;
262 }
263 }
264 return offset;
265}
266
267
268//
269// Get the string reprentation of the Cigar operations in this object.
270// Caller must delete the returned value.
271//
273{
274 // NB: the exact size of the string is not important, it just needs to be guaranteed
275 // larger than the largest number of characters we could put into it.
276
277 // we do not explicitly manage memory usage, and we expect when program exits, the memory used here will be freed
278 static char *ret = NULL;
279 static unsigned int retSize = 0;
280
281 if (ret == NULL)
282 {
283 retSize = cigarOperations.size() * 12 + 1; // 12 == a magic number -> > 1 + log base 10 of MAXINT
284 ret = (char*) malloc(sizeof(char) * retSize);
285 assert(ret != NULL);
286
287 }
288 else
289 {
290 // currently, ret pointer has enough memory to use
291 if (retSize > cigarOperations.size() * 12 + 1)
292 {
293 }
294 else
295 {
296 retSize = cigarOperations.size() * 12 + 1;
297 free(ret);
298 ret = (char*) malloc(sizeof(char) * retSize);
299 }
300 assert(ret != NULL);
301 }
302
303 char *ptr = ret;
304 char buf[12]; // > 1 + log base 10 of MAXINT
305
306 std::vector<CigarOperator>::iterator i;
307
308 // Progressively append the character representations of the operations to
309 // the cigar string we allocated above.
310
311 *ptr = '\0'; // clear result string
312 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
313 {
314 sprintf(buf, "%d%c", (*i).count, (*i).getChar());
315 strcat(ptr, buf);
316 while (*ptr)
317 {
318 ptr++; // limit the cost of strcat above
319 }
320 }
321 return ret;
322}
323
324
326{
327 // Clearing the cigar, so the query & reference indexes are out of
328 // date, so clear them.
329 clearQueryAndReferenceIndexes();
330 cigarOperations.clear();
331}
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
CigarRoller & operator+=(CigarRoller &rhs)
Add the contents of the specified CigarRoller to this object.
Definition: CigarRoller.cpp:29
bool Remove(int index)
Remove the operation at the specified index.
bool IncrementCount(int index, int increment)
Increments the count for the operation at the specified index by the specified value,...
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
bool Update(int index, Operation op, int count)
Updates the operation at the specified index to be the specified operation and have the specified cou...
void clear()
Clear this object so that it has no Cigar Operations.
const char * getString()
Get the string reprentation of the Cigar operations in this object, caller must delete the returned v...
void Set(const char *cigarString)
Sets this object to the specified cigarString.
CigarRoller & operator=(CigarRoller &rhs)
Set this object to be equal to the specified CigarRoller.
Definition: CigarRoller.cpp:66
int getMatchPositionOffset()
DEPRECATED - do not use, there are better ways to accomplish that by using read lengths,...
Operation
Enum for the cigar operations.
Definition: Cigar.h:87
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition: Cigar.h:95
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
@ pad
Padding (not in reference or query). Associated with CIGAR Operation "P".
Definition: Cigar.h:96
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
@ skip
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94