libStatGen Software 1
PedigreeLoader.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "Pedigree.h"
19#include "FortranFormat.h"
20#include "Error.h"
21
22#include <stdlib.h>
23#include <ctype.h>
24#include <string.h>
25
26void Pedigree::Prepare(IFILE & input)
27{
28 pd.Load(input);
29}
30
31void Pedigree::Load(IFILE & input)
32{
33 if (pd.mendelFormat)
34 {
35 LoadMendel(input);
36 return;
37 }
38
39 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
40
41 int textCols = pd.CountTextColumns() + 5;
42 int oldCount = count;
43 bool warn = true;
44 int line = 0;
45
46 String buffer;
47 StringArray tokens;
48
49 while (!ifeof(input))
50 {
51 int field = 0;
52
53 buffer.ReadLine(input);
54
55 tokens.Clear();
56 tokens.AddTokens(buffer, WHITESPACE);
57
58 if (tokens.Length() == 0) continue;
59 if (tokens[0].SlowCompare("end") == 0) break;
60
61 line++;
62
63 if (tokens.Length() < textCols)
64 {
65 if (buffer.Length() > 79)
66 {
67 buffer.SetLength(75);
68 buffer += " ...";
69 }
70
71 String description;
72
73 pd.ColumnSummary(description);
74 error("Loading Pedigree...\n\n"
75 "Expecting %d columns (%s),\n"
76 "but read only %d columns in line %d.\n\n"
77 "The problem line is transcribed below:\n%s\n",
78 textCols, (const char *) description,
79 tokens.Length(), line, (const char *) buffer);
80 }
81
82 if (tokens.Length() > textCols && warn && textCols > 5)
83 {
84 pd.ColumnSummary(buffer);
85 printf("WARNING -- Trailing columns in pedigree file will be ignored\n"
86 " Expecting %d data columns (%s)\n"
87 " However line %d, for example, has %d data columns\n\n",
88 textCols - 5, (const char *) buffer, line, tokens.Length() - 5);
89 warn = false;
90 }
91
92 Person * p;
93
94 // create a new person if necessary
95 if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
96 {
97 if (count == size) Grow();
98
99 p = persons[count++] = new Person;
100 }
101
102 p->famid = tokens[field++]; // famid
103 p->pid = tokens[field++]; // pid
104 p->fatid = tokens[field++]; // fatid
105 p->motid = tokens[field++]; // motid
106
107 bool failure = false;
108 p->sex = TranslateSexCode(tokens[field++], failure);
109 if (failure)
110 error("Can't interpret the sex of individual #%d\n"
111 "Family: %s Individual: %s Sex Code: %s", count,
112 (const char *) p->famid, (const char *) p->pid,
113 (const char *) tokens[field-1]);
114
115 if (sexAsCovariate)
116 {
117 if (p->sex)
118 p->covariates[sexCovariate] = p->sex;
119 else
120 p->covariates[sexCovariate] = _NAN_;
121 }
122
123 for (int col = 0; col < pd.columnCount; col++)
124 switch (pd.columns[col])
125 {
126 case pcAffection :
127 {
128 int a = pd.columnHash[col];
129 int new_status;
130
131 const char * affection = tokens[field++];
132
133 switch (toupper(affection[0]))
134 {
135 case '1' :
136 case 'N' :
137 case 'U' :
138 new_status = 1;
139 break;
140 case '2' :
141 case 'D' :
142 case 'A' :
143 case 'Y' :
144 new_status = 2;
145 break;
146 default :
147 new_status = atoi(affection);
148 if (new_status < 0 || new_status > 2)
149 error("Incorrect formatting for affection status "
150 "Col %d, Affection %s\n"
151 "Family: %s Individual: %s Status: %s",
152 col, (const char *) affectionNames[a],
153 (const char *) p->famid, (const char *) p->pid,
154 affection);
155 }
156 if (new_status != 0 && p->affections[a] != 0 &&
157 new_status != p->affections[a])
158 error("Conflict with previous affection status - "
159 "Col %d, Affection %s\n"
160 "Family: %s Individual: %s Old: %d New: %d",
161 col, (const char *) affectionNames[a],
162 (const char *) p->famid, (const char *) p->pid,
163 p->affections[a], new_status);
164 if (new_status) p->affections[a] = new_status;
165 break;
166 }
167 case pcMarker :
168 {
169 int m = pd.columnHash[col];
170
171 Alleles new_genotype;
172
173 new_genotype[0] = LoadAllele(m, tokens[field++]);
174 new_genotype[1] = LoadAllele(m, tokens[field++]);
175
176 if (p->markers[m].isKnown() && new_genotype.isKnown() &&
177 new_genotype != p->markers[m])
178 {
179 MarkerInfo * info = GetMarkerInfo(m);
180
181 error("Conflict with previous genotype - Col %d, Marker %s\n"
182 "Family: %s Individual: %s Old: %s/%s New: %s/%s",
183 col, (const char *) markerNames[m],
184 (const char *) p->famid, (const char *) p->pid,
185 (const char *) info->GetAlleleLabel(p->markers[m][0]),
186 (const char *) info->GetAlleleLabel(p->markers[m][1]),
187 (const char *) info->GetAlleleLabel(new_genotype[0]),
188 (const char *) info->GetAlleleLabel(new_genotype[1]));
189 }
190
191 if (new_genotype.isKnown()) p->markers[m] = new_genotype;
192 break;
193 }
194 case pcTrait :
195 case pcUndocumentedTraitCovariate :
196 {
197 int t = pd.columnHash[col];
198 double new_pheno = _NAN_;
199
200 if (pd.columns[col] == pcUndocumentedTraitCovariate)
201 t = t / 32768;
202
203 const char * value = tokens[field++];
204 char * flag = NULL;
205
206 if (missing == (const char *) NULL || strcmp(value, missing) != 0)
207 new_pheno = strtod(value, &flag);
208 if (flag != NULL && *flag) new_pheno = _NAN_;
209
210 if (p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
211 new_pheno != p->traits[t])
212 error("Conflict with previous phenotype - Col %d, Trait %s\n"
213 "Family: %s Individual: %s Old: %f New: %f",
214 col, (const char *) traitNames[t],
215 (const char *) p->famid, (const char *) p->pid,
216 p->traits[t], new_pheno);
217
218 if (new_pheno != _NAN_) p->traits[t] = new_pheno;
219 if (pd.columns[col] == pcTrait) break;
220 }
221 case pcCovariate :
222 {
223 int c = pd.columnHash[col];
224 double new_covar = _NAN_;
225
226 if (pd.columns[col] == pcUndocumentedTraitCovariate)
227 {
228 c = c % 32768;
229 field--;
230 }
231
232 const char * value = tokens[field++];
233 char * flag = NULL;
234
235 if (missing == (const char *) NULL || strcmp(value, missing) != 0)
236 new_covar = strtod(value, &flag);
237 if (flag != NULL && *flag) new_covar = _NAN_;
238
239 if (p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
240 new_covar != p->covariates[c])
241 error("Conflict with previous value - Col %d, Covariate %s\n"
242 "Family: %s Individual: %s Old: %f New: %f",
243 col, (const char *) covariateNames[c],
244 (const char *) p->famid, (const char *) p->pid,
245 p->covariates[c], new_covar);
246
247 if (new_covar != _NAN_) p->covariates[c] = new_covar;
248 break;
249 }
250 case pcString :
251 {
252 int c = pd.columnHash[col];
253
254 if (!p->strings[c].IsEmpty() && p->strings[c] != tokens[field])
255 error("Conflict with previous value - Col %d, String %s\n"
256 "Family: %s Individual: %s Old: %s New: %s",
257 col, (const char *) stringNames[c],
258 (const char *) p->famid, (const char *) p->pid,
259 (const char *) p->strings[c], (const char *) tokens[field]);
260
261 p->strings[c] = tokens[field++];
262
263 break;
264 }
265 case pcSkip :
266 field++;
267 break;
268 case pcZygosity :
269 {
270 int new_zygosity;
271
272 const char * zygosity = tokens[field++];
273
274 switch (zygosity[0])
275 {
276 case 'D' :
277 case 'd' :
278 new_zygosity = 2;
279 break;
280 case 'M' :
281 case 'm' :
282 new_zygosity = 1;
283 break;
284 default :
285 new_zygosity = atoi(zygosity);
286 }
287 if (p->zygosity != 0 && new_zygosity != p->zygosity)
288 error("Conflict with previous zygosity - "
289 "Column %d in pedigree\n"
290 "Family: %s Individual: %s Old: %d New: %d\n",
291 col, (const char *) p->famid, (const char *) p->pid,
292 p->zygosity, new_zygosity);
293 p->zygosity = new_zygosity;
294 break;
295 }
296 case pcEnd :
297 break;
298 default :
299 error("Inconsistent Pedigree Description -- Internal Error");
300 }
301 }
302
303 Sort();
304}
305
306void Pedigree::LoadMendel(IFILE & input)
307{
308 // First, retrieve the two format statements from file
309 String familyHeader;
310 String individualRecord;
311
312 familyHeader.ReadLine(input);
313 individualRecord.ReadLine(input);
314
315 // Then create two FORTRAN input streams...
316 // One will be used for retrieving family labels and sizes, the other
317 // will be used for individual information
318 FortranFormat headers, records;
319
320 headers.SetInputFile(input);
321 headers.SetFormat(familyHeader);
322
323 records.SetInputFile(input);
324 records.SetFormat(individualRecord);
325
326 // Storage for key pieces of information
327 String famid;
328 String phenotype;
329 String affectionCode;
330 String affectionStem;
331 int familySize;
332
333 String allele1, allele2;
334
335 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
336
337 while (!ifeof(input))
338 {
339 if (count == size)
340 Grow();
341
342 // Retrieve header for next family
343 familySize = headers.GetNextInteger();
344 headers.GetNextField(famid);
345 headers.Flush();
346
347 if (famid.IsEmpty())
348 {
349 if (ifeof(input) && familySize == 0)
350 break;
351 else
352 error("Blank family id encountered\n");
353 }
354
355 // Retrieve each individual in the family
356 for (int i = 0; i < familySize; i++)
357 {
358 Person * p = persons[count++] = new Person;
359
360 // Retrieve basic pedigree structure
361 p->famid = famid;
362 records.GetNextField(p->pid);
363 records.GetNextField(p->fatid);
364 records.GetNextField(p->motid);
365
366 if (p->pid.IsEmpty())
367 error("No unique identifier for individual #%d in family %s\n",
368 i + 1, (const char *) famid);
369
370 if (p->pid.Compare(".") == 0)
371 error("Family %s has an individual named '.', but this code is\n"
372 "reserved to indicate missing parents\n");
373
374 if (p->fatid.IsEmpty()) p->fatid = ".";
375 if (p->motid.IsEmpty()) p->motid = ".";
376
377 // Retrieve and decode sex code
378 char sex = records.GetNextCharacter();
379
380 switch (sex)
381 {
382 case '0' :
383 case 'x' :
384 case 'X' :
385 case '?' :
386 case 0 :
387 p->sex = 0;
388 break;
389 case '1' :
390 case 'm' :
391 case 'M' :
392 p->sex = 1;
393 break;
394 case '2' :
395 case 'f' :
396 case 'F' :
397 p->sex = 2;
398 break;
399 default :
400 error("Can't interpret the sex of individual #%d\n"
401 "Family: %s Individual: %s Sex Code: %s", count,
402 (const char *) p->famid, (const char *) p->pid, sex);
403 };
404
405 if (sexAsCovariate)
406 {
407 if (p->sex)
408 p->covariates[sexCovariate] = p->sex;
409 else
410 p->covariates[sexCovariate] = _NAN_;
411 }
412
413 // Retrieve and decode zygosity
414 char zygosity = records.GetNextCharacter();
415
416 // Mendel uses a unique character to indicate each MZ pair,
417 // we use a unique odd number...
418 if (zygosity)
419 p->zygosity = (zygosity - ' ') * 2 - 1;
420
421 affectionStem.Clear();
422 for (int col = 0; col < pd.columnCount; col++)
423 switch (pd.columns[col])
424 {
425 case pcAffection :
426 {
427 int a = pd.columnHash[col];
428
429 // We expand each Mendel non-codominant trait into multiple
430 // affection status column... First, if this is not a
431 // continuation of a previous expansion we first retrieve
432 // and encode the affection status.
433 if (affectionStem.Length() == 0 ||
434 affectionNames[a].CompareToStem(affectionStem) != 0)
435 {
436 affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1);
437 records.GetNextField(phenotype);
438 affectionCode = affectionStem + phenotype;
439 }
440
441 // Then encode each phenotype appropriately
442 if (phenotype.IsEmpty())
443 p->affections[a] = 0;
444 else
445 p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
446
447 break;
448 }
449 case pcMarker :
450 {
451 int m = pd.columnHash[col];
452
453 records.GetNextField(phenotype);
454
455 if (phenotype.IsEmpty())
456 {
457 p->markers[m].one = p->markers[m].two = 0;
458 continue;
459 }
460
461 int separator = phenotype.FindChar('/');
462 if (separator == -1) separator = phenotype.FindChar('|');
463
464 if (separator == -1)
465 error("At marker %s, person %s in family %s has genotype %s.\n"
466 "This genotype is not in the 'al1/al2' format.\n",
467 (const char *) markerNames[m],
468 (const char *) p->pid,
469 (const char *) p->famid,
470 (const char *) phenotype);
471
472 allele1.Copy(phenotype, 0, separator);
473 allele1.Trim();
474
475 allele2.Copy(phenotype, separator + 1, 8);
476 allele2.Trim();
477
478 MarkerInfo * info = GetMarkerInfo(m);
479
480 int one = info->alleleNumbers.Integer(allele1);
481
482 if (one < 0)
483 {
484 if (info->freq.Length() == 0)
485 one = info->NewAllele(allele1);
486 else
487 error("At marker %s, person %s in family %s has genotype %s.\n"
488 "However, '%s' is not a valid allele for this marker.\n",
489 (const char *) markerNames[m],
490 (const char *) p->pid,
491 (const char *) p->famid,
492 (const char *) phenotype,
493 (const char *) allele1);
494 }
495
496 int two = info->alleleNumbers.Integer(allele2);
497
498 if (two < 0)
499 {
500 if (info->freq.Length() == 0)
501 two = info->NewAllele(allele2);
502 else
503 error("At marker %s, person %s in family %s has genotype %s.\n"
504 "However, '%s' is not a valid allele for this marker.\n",
505 (const char *) markerNames[m],
506 (const char *) p->pid,
507 (const char *) p->famid,
508 (const char *) phenotype,
509 (const char *) allele2);
510 }
511
512 p->markers[m].one = one;
513 p->markers[m].two = two;
514 break;
515 }
516 case pcEnd :
517 break;
518 case pcTrait :
519 case pcCovariate :
520 case pcSkip :
521 case pcZygosity :
522 default:
523 error("Inconsistent Pedigree Description -- Internal Error");
524 }
525
526 records.Flush();
527 }
528 }
529
530 Sort();
531}
532
533void Pedigree::Prepare(const char * filename)
534{
535 // Clear any previously loaded pedigree description
536 if (multiPd != NULL)
537 delete [] multiPd;
538
539 multiFileCount = 1;
540
541 // Enable multifile support
542 StringArray filenames;
543
544 filenames.AddColumns(filename, ',');
545
546 if (filenames.Length() <= 1)
547 pd.Load(filename);
548 else
549 {
550 printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
551
552 multiPd = new PedigreeDescription[filenames.Length()];
553
554 for (int i = 0; i < filenames.Length(); i++)
555 {
556 printf(" AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]);
557 multiPd[i].Load(filenames[i], false);
558 }
559
560 multiFileCount = filenames.Length();
561 }
562}
563
564void Pedigree::Load(const char * filename, bool allowFailures)
565{
566 if (multiFileCount <= 1)
567 {
568 IFILE f = ifopen(filename, "rb");
569
570 if (f == NULL && allowFailures)
571 return;
572
573 if (f == NULL)
574 error(
575 "The pedigree file %s cannot be opened\n\n"
576 "Common causes for this problem are:\n"
577 " * You might not have used the correct options to specify input file names,\n"
578 " please check the program documentation for information on how to do this\n\n"
579 " * The file doesn't exist or the filename might have been misspelt\n\n"
580 " * The file exists but it is being used by another program which you will need\n"
581 " to close\n\n"
582 " * The file is larger than 2GB and you haven't compiled this application with\n"
583 " large file support.\n\n",
584 filename);
585
586 Load(f);
587 ifclose(f);
588 }
589 else
590 {
591 StringArray filenames;
592
593 filenames.AddColumns(filename, ',');
594
595 if (filenames.Length() != multiFileCount)
596 error("Different numbers of comma separated data and pedigree file names provided\n");
597
598 for (int i = 0; i < filenames.Length(); i++)
599 {
600 printf(" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
601 (const char *) multiPd[i].filename, (const char *) filenames[i]);
602
603 pd = multiPd[i];
604
605 IFILE f = ifopen(filenames[i], "rb");
606
607 if (f == NULL)
608 error("The pedigree file '%s' cannot be opened\n\n",
609 (const char *) filenames[i]);
610
611 Load(f);
612 ifclose(f);
613 }
614
615 printf("\n");
616 }
617}
618
619int Pedigree::TranslateSexCode(const char * code, bool & failure)
620{
621 failure = false;
622
623 switch (code[0])
624 {
625 case 'x' :
626 case 'X' :
627 case '?' :
628 return 0;
629 case '1' :
630 case 'm' :
631 case 'M' :
632 return 1;
633 case '2' :
634 case 'f' :
635 case 'F' :
636 return 2;
637 default :
638 {
639 int result = atoi(code);
640
641 if (result != 0 && result != 1 && result != 2)
642 {
643 failure = true;
644 result = 0;
645 }
646
647 return result;
648 }
649 };
650}
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37