PlinkRodWithGenomeLoc now properly parses text plink files. Unit test added to test this functionality. Indels and binary files to come.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2666 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-01-23 06:19:26 +00:00
parent 648a36d08e
commit 42fb85e7f3
2 changed files with 135 additions and 5 deletions

View File

@ -4,10 +4,15 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import java.io.*;
import java.util.*;
import net.sf.samtools.SAMFileHeader;
/**
* Created by IntelliJ IDEA.
* User: chartl
@ -16,7 +21,7 @@ import java.util.*;
* To change this template use File | Settings | File Templates.
*/
public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
private boolean allowTest = false;
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
private final byte SNP_MAJOR_MODE = 0x00000001;
@ -25,7 +30,6 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
private PlinkVariantInfo currentVariant;
private ListIterator<PlinkVariantInfo> variantIterator;
private PlinkFileType plinkFileType;
public enum PlinkFileType {
@ -93,10 +97,11 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
return currentVariant.getGenotypes();
}
// AM I PARSING A TEXT OR A BINARY FILE ??
private ArrayList<PlinkVariantInfo> parsePlinkFile(File file) {
String[] splitFileName = file.getName().split(".");
String[] splitFileName = file.getName().split("\\.");
String extension = splitFileName[splitFileName.length-1];
if ( extension.equals("ped") || extension.equals("raw") ) {
return parseTextFormattedPlinkFile(file);
@ -127,6 +132,7 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
incorporateInfo(seqVars,snpOffsets,line);
} while ( line != null );
java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these
// are sorted in standard reference order
@ -140,6 +146,10 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
}
private void incorporateInfo(List<PlinkVariantInfo> vars, List<Integer> offsets, String plinkLine) {
if ( plinkLine == null ) {
return;
}
String[] plinkInfo;
if ( plinkFileType == PlinkFileType.STANDARD_PED) {
plinkInfo = plinkLine.split("\t");
@ -190,7 +200,7 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
ArrayList<Integer> offsets = new ArrayList<Integer>();
String[] headerFields;
if ( plinkFileType == PlinkFileType.STANDARD_PED ) {
headerFields = header.split("\t");
headerFields = header.split("\t+");
} else {
headerFields = header.split("\\s+");
}
@ -439,6 +449,7 @@ class PlinkVariantInfo implements Comparable {
public PlinkVariantInfo(String variantName) {
this.variantName = variantName;
parseName();
}
public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) {
@ -451,7 +462,7 @@ class PlinkVariantInfo implements Comparable {
}
private void parseName() {
String chrom = this.variantName.split("_c")[1].split("_")[0];
String chrom = this.variantName.split("\\|c")[1].split("_")[0];
String pos = this.variantName.split("_p")[1].split("_")[0];
this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
this.parseNameForIndels();
@ -505,6 +516,8 @@ class PlinkVariantInfo implements Comparable {
for ( String alStr : alleleStrings ) {
alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr));
}
genotypes.add(new Genotype(alleles,sampleName,20.0) );
sampleNames.add(sampleName);
}
private void addIndel(String[] alleleStrings, String sampleName) {

View File

@ -0,0 +1,117 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.Assert;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: Jan 22, 2010
* Time: 11:27:33 PM
* To change this template use File | Settings | File Templates.
*/
public class PlinkRodTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
public BufferedReader openFile(String filename) {
try {
return new BufferedReader(new FileReader(filename));
} catch (FileNotFoundException e) {
throw new StingException("Couldn't open file " + filename);
}
}
@Test
public void testStandardPedFile() {
PlinkRodWithGenomeLoc rod = new PlinkRodWithGenomeLoc("test");
try {
rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") );
} catch ( FileNotFoundException e ) {
throw new StingException("test file for testStandardPedFile() does not exist",e);
}
// test that the sample names are correct
List<String> rodNames = rod.getVariantSampleNames();
List<String> expectedNames = Arrays.asList("NA12887","NAMELY","COWBA");
Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size());
boolean namesCorrect = true;
for ( int i = 0; i < expectedNames.size(); i++ ) {
namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) );
}
Assert.assertTrue("That the names are correct and in the proper order",namesCorrect);
// test that rod can be iterated over
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
do {
genotypesInRod.add(rod.getGenotypes());
sampleNamesInRod.add(rod.getVariantSampleNames());
lociInRod.add(rod.getLocation());
} while ( rod.parseLine(null,null) );
Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size());
ArrayList<Genotype> snp1 = genotypesInRod.get(0);
ArrayList<Genotype> snp3 = genotypesInRod.get(2);
Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size());
Assert.assertEquals("That there are three samples in SNP 1", 3, sampleNamesInRod.get(0).size());
Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size());
List<Allele> snp1_individual3_alleles = snp1.get(2).getAlleles();
List<Allele> snp3_individual2_alleles = snp3.get(1).getAlleles();
String alleleStr1 = snp1_individual3_alleles.get(0).getBases()+snp1_individual3_alleles.get(1).getBases();
String alleleStr2 = snp3_individual2_alleles.get(0).getBases()+snp3_individual2_alleles.get(1).getBases();
Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1);
Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2);
boolean name2isSame = true;
for ( ArrayList<String> names : sampleNamesInRod ) {
name2isSame = name2isSame && names.get(1).equals("NAMELY");
}
Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame);
// test that the loci are correctly parsed and in order
List<String> expectedLoci = Arrays.asList("1:123456","2:13274","3:11111");
boolean lociCorrect = true;
for ( int i = 0; i < 3; i ++ ) {
lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i));
}
}
}