Minor changes to oneoff walkers. PlinkRod altered but still commented.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2808 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
21369869b7
commit
935e76daa1
File diff suppressed because it is too large
Load Diff
|
|
@ -6,6 +6,8 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
|
|
@ -17,7 +19,7 @@ import java.util.Random;
|
|||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
|
||||
public class ReadErrorRateWalker extends ReadWalker<boolean[], ReadErrorRateCollection> {
|
||||
@Argument(fullName="printVisualHits", shortName="v", doc="print visual hits", required=false) public boolean printVisualHits = false;
|
||||
@Argument(fullName="useNextBestBase", shortName="nb", doc="use next best base", required=false) public boolean useNextBestBase = false;
|
||||
@Argument(fullName="useNonNextBestBase",shortName="nnb",doc="use nonnext best base",required=false) public boolean useNonNextBestBase = false;
|
||||
|
|
@ -134,38 +136,80 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
|
|||
*
|
||||
* @return null
|
||||
*/
|
||||
public int[] reduceInit() {
|
||||
return null;
|
||||
public ReadErrorRateCollection reduceInit() {
|
||||
return new ReadErrorRateCollection();
|
||||
}
|
||||
|
||||
/**
|
||||
* Summarize the error rate data.
|
||||
*
|
||||
* @param value the read mismatch array
|
||||
* @param sum the summed mismatch array
|
||||
* @param collection the summed mismatch array
|
||||
* @return the summed mismatch array with the new read mismatch array added
|
||||
*/
|
||||
public int[] reduce(boolean[] value, int[] sum) {
|
||||
if (sum == null) {
|
||||
sum = new int[value.length];
|
||||
}
|
||||
public ReadErrorRateCollection reduce(boolean[] value, ReadErrorRateCollection collection) {
|
||||
|
||||
for (int cycle = 0; cycle < value.length; cycle++) {
|
||||
sum[cycle] += (value[cycle] ? 1 : 0);
|
||||
}
|
||||
collection.update(value);
|
||||
|
||||
return sum;
|
||||
return collection;
|
||||
}
|
||||
|
||||
/**
|
||||
* We've processed all the reads. Emit the final, normalized error rate data.
|
||||
*
|
||||
* @param sum the summed mismatch array
|
||||
* @param collection the summed mismatch arrays
|
||||
*/
|
||||
public void onTraversalDone(int[] sum) {
|
||||
for (int cycle = 0; cycle < sum.length - 1; cycle++) {
|
||||
double errorrate = ((double) sum[cycle])/((double) sum[sum.length - 1]);
|
||||
out.println("[ERROR_RATE] " + cycle + " " + errorrate);
|
||||
}
|
||||
public void onTraversalDone(ReadErrorRateCollection collection) {
|
||||
|
||||
out.print(collection.toString());
|
||||
}
|
||||
}
|
||||
|
||||
class ReadErrorRateCollection {
|
||||
private HashMap<Integer,int[]> readsByReadLength;
|
||||
|
||||
public ReadErrorRateCollection() {
|
||||
readsByReadLength = new HashMap<Integer, int[]>();
|
||||
}
|
||||
|
||||
public void update(boolean[] mismatchArray) {
|
||||
if ( ! readsByReadLength.containsKey(mismatchArray.length) ) {
|
||||
readsByReadLength.put(mismatchArray.length, zeroArray(mismatchArray.length));
|
||||
}
|
||||
|
||||
updateErrorCounts(readsByReadLength.get(mismatchArray.length), mismatchArray);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for ( int length : readsByReadLength.keySet() ) {
|
||||
for ( int cycle = 0; cycle < length-1; cycle++) {
|
||||
int[] counts = readsByReadLength.get(length);
|
||||
builder.append(length);
|
||||
builder.append("\t");
|
||||
builder.append(cycle);
|
||||
builder.append("\t");
|
||||
builder.append( ( ( double ) counts[cycle] / ( (double) counts[length-1])));
|
||||
builder.append("\n");
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private static int[] zeroArray( int length ) {
|
||||
int[] array = new int[length];
|
||||
for ( int ii = 0; ii < length; ii ++ ) {
|
||||
array[ii] = 0;
|
||||
}
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
public static void updateErrorCounts(int[] sum, boolean[] value) {
|
||||
|
||||
for (int cycle = 0; cycle < value.length; cycle++) {
|
||||
sum[cycle] += (value[cycle] ? 1 : 0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -22,11 +22,6 @@ class MultiSampleConcordanceSet {
|
|||
private long truthSitesFilteredOut;
|
||||
private int genotypeQuality;
|
||||
|
||||
private int truePositiveLoci;
|
||||
private int trueNegativeLoci;
|
||||
private int falsePositiveLoci;
|
||||
private int falseNegativeLoci;
|
||||
|
||||
public MultiSampleConcordanceSet(int minDepth, boolean assumeRef, int genotypeQuality) {
|
||||
concordanceSet = new HashSet<VCFConcordanceCalculator>();
|
||||
truthOnlySites = 0l;
|
||||
|
|
|
|||
|
|
@ -58,35 +58,58 @@ public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInf
|
|||
ReferenceOrderedDatum truthData = tracker.lookup("truth", null);
|
||||
ReferenceOrderedDatum variantData = tracker.lookup("variants",null);
|
||||
LocusConcordanceInfo concordance;
|
||||
|
||||
if ( truthData == null && variantData == null) {
|
||||
|
||||
concordance = null;
|
||||
|
||||
} else if ( truthData == null ) {
|
||||
|
||||
// not in the truth set
|
||||
if ( ( (RodVCF) variantData ).isFiltered() ) {
|
||||
|
||||
concordance = null;
|
||||
|
||||
} else {
|
||||
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
|
||||
}
|
||||
|
||||
} else if ( variantData == null ) {
|
||||
|
||||
// not in the variant set
|
||||
if ( ( (RodVCF) truthData).isFiltered() ) {
|
||||
|
||||
concordance = null;
|
||||
|
||||
} else {
|
||||
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// in both
|
||||
// check for filtering
|
||||
boolean truth_filter = ((RodVCF) truthData).isFiltered();
|
||||
boolean call_filter = ((RodVCF) variantData).isFiltered();
|
||||
|
||||
if ( truth_filter && call_filter ) {
|
||||
|
||||
concordance = null;
|
||||
|
||||
} else if ( truth_filter ) {
|
||||
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
|
||||
|
||||
} else if ( call_filter ) {
|
||||
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET_VARIANT_FILTERED,( (RodVCF) truthData).getRecord(), null ,ref);
|
||||
|
||||
} else {
|
||||
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.BOTH_SETS,( (RodVCF) truthData).getRecord(),( (RodVCF) variantData).getRecord(),ref);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,243 +1,247 @@
|
|||
//package org.broadinstitute.sting.gatk.refdata;
|
||||
//
|
||||
//import org.broadinstitute.sting.BaseTest;
|
||||
//import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
//import org.broadinstitute.sting.utils.StingException;
|
||||
//import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
//import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
//import org.junit.BeforeClass;
|
||||
//import org.junit.Test;
|
||||
//import org.junit.Assert;
|
||||
//
|
||||
//import java.io.File;
|
||||
//import java.io.FileNotFoundException;
|
||||
//import java.io.BufferedReader;
|
||||
//import java.io.FileReader;
|
||||
//import java.util.*;
|
||||
//
|
||||
///**
|
||||
// * Created by IntelliJ IDEA.
|
||||
// * User: Ghost
|
||||
// * Date: Jan 22, 2010
|
||||
// * Time: 11:27:33 PM
|
||||
// * To change this template use File | Settings | File Templates.
|
||||
// */
|
||||
//public class PlinkRodTest extends BaseTest {
|
||||
// private static IndexedFastaSequenceFile seq;
|
||||
//
|
||||
// @BeforeClass
|
||||
// public static void beforeTests() {
|
||||
// try {
|
||||
// seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
|
||||
// } catch (FileNotFoundException e) {
|
||||
// throw new StingException("unable to load the sequence dictionary");
|
||||
// }
|
||||
// GenomeLocParser.setupRefContigOrdering(seq);
|
||||
//
|
||||
// }
|
||||
//
|
||||
// public BufferedReader openFile(String filename) {
|
||||
// try {
|
||||
// return new BufferedReader(new FileReader(filename));
|
||||
// } catch (FileNotFoundException e) {
|
||||
// throw new StingException("Couldn't open file " + filename);
|
||||
// }
|
||||
//
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testStandardPedFile() {
|
||||
// PlinkRod rod = new PlinkRod("test");
|
||||
// try {
|
||||
// rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") );
|
||||
// } catch ( FileNotFoundException e ) {
|
||||
// throw new StingException("test file for testStandardPedFile() does not exist",e);
|
||||
// }
|
||||
//
|
||||
// // test that the sample names are correct
|
||||
//
|
||||
// List<String> rodNames = rod.getVariantSampleNames();
|
||||
// List<String> expectedNames = Arrays.asList("NA12887","NAMELY","COWBA");
|
||||
//
|
||||
// Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size());
|
||||
//
|
||||
// boolean namesCorrect = true;
|
||||
// for ( int i = 0; i < expectedNames.size(); i++ ) {
|
||||
// namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) );
|
||||
// }
|
||||
//
|
||||
// Assert.assertTrue("That the names are correct and in the proper order",namesCorrect);
|
||||
//
|
||||
// // test that rod can be iterated over
|
||||
//
|
||||
// ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
// ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
// ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
// do {
|
||||
// genotypesInRod.add(rod.getGenotypes());
|
||||
// sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
// lociInRod.add(rod.getLocation());
|
||||
// } while ( rod.parseLine(null,null) );
|
||||
//
|
||||
// Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size());
|
||||
//
|
||||
// ArrayList<Genotype> snp1 = genotypesInRod.get(0);
|
||||
// ArrayList<Genotype> snp3 = genotypesInRod.get(2);
|
||||
//
|
||||
// Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size());
|
||||
// Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size());
|
||||
// Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size());
|
||||
//
|
||||
//
|
||||
// List<Allele> snp1_individual3_alleles = snp1.get(2).getAlleles();
|
||||
// List<Allele> snp3_individual2_alleles = snp3.get(1).getAlleles();
|
||||
//
|
||||
// String alleleStr1 = snp1_individual3_alleles.get(0).getBases()+snp1_individual3_alleles.get(1).getBases();
|
||||
// String alleleStr2 = snp3_individual2_alleles.get(0).getBases()+snp3_individual2_alleles.get(1).getBases();
|
||||
//
|
||||
// Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1);
|
||||
// Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2);
|
||||
//
|
||||
// boolean name2isSame = true;
|
||||
//
|
||||
// for ( ArrayList<String> names : sampleNamesInRod ) {
|
||||
// name2isSame = name2isSame && names.get(1).equals("NAMELY");
|
||||
// }
|
||||
//
|
||||
// Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame);
|
||||
//
|
||||
// // test that the loci are correctly parsed and in order
|
||||
//
|
||||
// List<String> expectedLoci = Arrays.asList("1:123456","2:13274","3:11111");
|
||||
// boolean lociCorrect = true;
|
||||
// for ( int i = 0; i < 3; i ++ ) {
|
||||
// lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testStandardPedFileWithIndels() {
|
||||
// PlinkRod rod = new PlinkRod("test");
|
||||
// try {
|
||||
// rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") );
|
||||
// } catch ( FileNotFoundException e) {
|
||||
// throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e);
|
||||
// }
|
||||
//
|
||||
// // Iterate through the rod
|
||||
//
|
||||
// List<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
// ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
// ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
// ArrayList<Boolean> snpSites = new ArrayList<Boolean>();
|
||||
// do {
|
||||
// genotypesInRod.add(rod.getGenotypes());
|
||||
// sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
// lociInRod.add(rod.getLocation());
|
||||
// snpSites.add(rod.variantIsSNP());
|
||||
// } while ( rod.parseLine(null,null) );
|
||||
//
|
||||
// boolean snpOrder = true;
|
||||
// List<Boolean> expectedOrder = Arrays.asList(true,false,true,false);
|
||||
// for ( int i = 0; i < 4; i ++ ) {
|
||||
// snpOrder = snpOrder && ( expectedOrder.get(i) == snpSites.get(i) );
|
||||
// }
|
||||
//
|
||||
// Assert.assertTrue("That the variant type order is as expected", snpOrder);
|
||||
// Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1).isPointGenotype() );
|
||||
// Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() );
|
||||
// Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom());
|
||||
// Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet());
|
||||
// Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases());
|
||||
// Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases());
|
||||
// Assert.assertEquals("That the snp bases are correct","GC",genotypesInRod.get(2).get(2).getAlleles().get(0).getBases()+genotypesInRod.get(2).get(2).getAlleles().get(1).getBases());
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testBinaryPedFileNoIndels() {
|
||||
// PlinkRod rod = new PlinkRod("binaryTest1");
|
||||
// try {
|
||||
// rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_noindel_test.bed"));
|
||||
// } catch (FileNotFoundException e) {
|
||||
// throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e);
|
||||
// }
|
||||
//
|
||||
// // iterate through the ROD and get stuff
|
||||
// ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
// ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
// ArrayList<ArrayList<String>> samplesInRod = new ArrayList<ArrayList<String>>();
|
||||
//
|
||||
// do {
|
||||
// lociInRod.add(rod.getLocation());
|
||||
// genotypesInRod.add(rod.getGenotypes());
|
||||
// samplesInRod.add(rod.getVariantSampleNames());
|
||||
// } while ( rod.parseLine(null,null) );
|
||||
//
|
||||
// List<String> expecLoc = Arrays.asList("1:123456","1:14327877","2:22074511","3:134787","3:178678","4:829645","4:5234132","12:1268713");
|
||||
//
|
||||
// for ( int i = 0; i < expecLoc.size(); i ++ ) {
|
||||
// Assert.assertEquals("That locus "+(i+1)+" in the rod is correct", expecLoc.get(i), lociInRod.get(i).toString());
|
||||
// }
|
||||
//
|
||||
// List<String> expecAlleles = Arrays.asList("AA","AA","AA","GG","GG","GG","AA","TA","TT","CC","CC","GC","TC","CC","TT",
|
||||
// "GG","GG","AG","TT","CC","CT","TG","GG","GG");
|
||||
// List<Boolean> expecHet = Arrays.asList(false,false,false,false,false,false,false,true,false,false,false,true,true,false,
|
||||
// false,false,false,true,false,false,true,true,false,false);
|
||||
// List<String> expecName = Arrays.asList("NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
|
||||
// "NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
|
||||
// "NA12878","NA12890","NA07000");
|
||||
// int snpNo = 1;
|
||||
// int indiv = 1;
|
||||
// int alleleOffset = 0;
|
||||
// for ( ArrayList<Genotype> snp : genotypesInRod ) {
|
||||
// for ( Genotype gen : snp ) {
|
||||
// String alStr = gen.getAlleles().get(0).getBases()+gen.getAlleles().get(1).getBases();
|
||||
// Assert.assertEquals("That the allele of person "+indiv+" for snp "+snpNo+" is correct "+
|
||||
// "(allele offset "+alleleOffset+")", expecAlleles.get(alleleOffset),alStr);
|
||||
// Assert.assertEquals("That the genotype of person "+indiv+" for snp "+snpNo+" is properly set", expecHet.get(alleleOffset),gen.isHet());
|
||||
// Assert.assertEquals("That the name of person "+indiv+" for snp "+snpNo+" is correct", expecName.get(alleleOffset),samplesInRod.get(snpNo-1).get(indiv-1));
|
||||
// indiv++;
|
||||
// alleleOffset++;
|
||||
// }
|
||||
// indiv = 1;
|
||||
// snpNo++;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void testIndelBinary() {
|
||||
// PlinkRod rod = new PlinkRod("binaryTest2");
|
||||
// try {
|
||||
// rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_indel_test.bed"));
|
||||
// } catch (FileNotFoundException e) {
|
||||
// throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e);
|
||||
// }
|
||||
//
|
||||
// ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
// do {
|
||||
// genotypesInRod.add(rod.getGenotypes());
|
||||
// } while ( rod.parseLine(null,null) );
|
||||
//
|
||||
// List<String> expecAlleles = Arrays.asList("ACCA","","ACCAACCA","GGGG","GG","","AA","TA","00","","CCTCCT","CCT",
|
||||
// "TC","CC","TT","GG","GG","AG","","CTTGCTTG","CTTG","TG","GG","GG");
|
||||
// List<Boolean> expecDeletion = Arrays.asList(false,false,false,false,false,false,false,false,false,true,false,true,
|
||||
// false,false,false,false,false,false,true,false,true,false,false,false);
|
||||
// List<Boolean> expecInsertion = Arrays.asList(true,false,true,true,true,false,false,false,false,false,false,false,
|
||||
// false,false,false,false,false,false,false,false,false,false,false,false);
|
||||
//
|
||||
// int al = 0;
|
||||
// for ( ArrayList<Genotype> snp : genotypesInRod ) {
|
||||
// for ( Genotype gen : snp ) {
|
||||
// String alStr = gen.getAlleles().get(0).getBases()+gen.getAlleles().get(1).getBases();
|
||||
// Allele firstAl = gen.getAlleles().get(0);
|
||||
// Allele secondAl = gen.getAlleles().get(1);
|
||||
// boolean isInsertion = ( firstAl.getType() == Allele.AlleleType.INSERTION || secondAl.getType() == Allele.AlleleType.INSERTION );
|
||||
// boolean isDeletion = ( firstAl.getType() == Allele.AlleleType.DELETION || secondAl.getType() == Allele.AlleleType.DELETION );
|
||||
// Assert.assertEquals("That the indel file has the correct alleles for genotype "+al,expecAlleles.get(al), alStr);
|
||||
// Assert.assertEquals("That the insertion property of genotype "+al+" is correct",expecInsertion.get(al),isInsertion);
|
||||
// Assert.assertEquals("That the deletion property of genotype "+al+" is correct", expecDeletion.get(al), isDeletion);
|
||||
// al++;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
package org.broadinstitute.sting.gatk.refdata;
|
||||
/*
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Jan 22, 2010
|
||||
* Time: 11:27:33 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*
|
||||
public class PlinkRodTest extends BaseTest {
|
||||
// todo :: get the isIndel() isInsertion() and isDeletion() tests working again -- this may require new
|
||||
// todo :: methods in the objects themselves
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeTests() {
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("unable to load the sequence dictionary");
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
|
||||
}
|
||||
|
||||
public BufferedReader openFile(String filename) {
|
||||
try {
|
||||
return new BufferedReader(new FileReader(filename));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Couldn't open file " + filename);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStandardPedFile() {
|
||||
PlinkRod rod = new PlinkRod("test");
|
||||
try {
|
||||
rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("test file for testStandardPedFile() does not exist",e);
|
||||
}
|
||||
|
||||
// test that the sample names are correct
|
||||
|
||||
List<String> rodNames = rod.getVariantSampleNames();
|
||||
List<String> expectedNames = Arrays.asList("NA12887","NAMELY","COWBA");
|
||||
|
||||
Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size());
|
||||
|
||||
boolean namesCorrect = true;
|
||||
for ( int i = 0; i < expectedNames.size(); i++ ) {
|
||||
namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) );
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the names are correct and in the proper order",namesCorrect);
|
||||
|
||||
// test that rod can be iterated over
|
||||
|
||||
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
lociInRod.add(rod.getLocation());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size());
|
||||
|
||||
ArrayList<Genotype> snp1 = genotypesInRod.get(0);
|
||||
ArrayList<Genotype> snp3 = genotypesInRod.get(2);
|
||||
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size());
|
||||
Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size());
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size());
|
||||
|
||||
|
||||
List<Allele> snp1_individual3_alleles = snp1.get(2).getAlleles();
|
||||
List<Allele> snp3_individual2_alleles = snp3.get(1).getAlleles();
|
||||
|
||||
String alleleStr1 = new String(snp1_individual3_alleles.get(0).getBases())+new String(snp1_individual3_alleles.get(1).getBases());
|
||||
String alleleStr2 = new String(snp3_individual2_alleles.get(0).getBases())+new String(snp3_individual2_alleles.get(1).getBases());
|
||||
|
||||
Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1);
|
||||
Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2);
|
||||
|
||||
boolean name2isSame = true;
|
||||
|
||||
for ( ArrayList<String> names : sampleNamesInRod ) {
|
||||
name2isSame = name2isSame && names.get(1).equals("NAMELY");
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame);
|
||||
|
||||
// test that the loci are correctly parsed and in order
|
||||
|
||||
List<String> expectedLoci = Arrays.asList("1:123456","2:13274","3:11111");
|
||||
boolean lociCorrect = true;
|
||||
for ( int i = 0; i < 3; i ++ ) {
|
||||
lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStandardPedFileWithIndels() {
|
||||
PlinkRod rod = new PlinkRod("test");
|
||||
try {
|
||||
rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") );
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e);
|
||||
}
|
||||
|
||||
// Iterate through the rod
|
||||
|
||||
List<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
ArrayList<Boolean> snpSites = new ArrayList<Boolean>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
lociInRod.add(rod.getLocation());
|
||||
snpSites.add(rod.variantIsSNP());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
boolean snpOrder = true;
|
||||
List<Boolean> expectedOrder = Arrays.asList(true,false,true,false);
|
||||
for ( int i = 0; i < 4; i ++ ) {
|
||||
snpOrder = snpOrder && ( expectedOrder.get(i) == snpSites.get(i) );
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the variant type order is as expected", snpOrder);
|
||||
// Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1). );
|
||||
// Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() );
|
||||
Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom());
|
||||
Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet());
|
||||
Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases());
|
||||
Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases());
|
||||
Assert.assertEquals("That the snp bases are correct","GC",new String(genotypesInRod.get(2).get(2).getAlleles().get(0).getBases())+new String(genotypesInRod.get(2).get(2).getAlleles().get(1).getBases()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBinaryPedFileNoIndels() {
|
||||
PlinkRod rod = new PlinkRod("binaryTest1");
|
||||
try {
|
||||
rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_noindel_test.bed"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e);
|
||||
}
|
||||
|
||||
// iterate through the ROD and get stuff
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> samplesInRod = new ArrayList<ArrayList<String>>();
|
||||
|
||||
do {
|
||||
lociInRod.add(rod.getLocation());
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
samplesInRod.add(rod.getVariantSampleNames());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
List<String> expecLoc = Arrays.asList("1:123456","1:14327877","2:22074511","3:134787","3:178678","4:829645","4:5234132","12:1268713");
|
||||
|
||||
for ( int i = 0; i < expecLoc.size(); i ++ ) {
|
||||
Assert.assertEquals("That locus "+(i+1)+" in the rod is correct", expecLoc.get(i), lociInRod.get(i).toString());
|
||||
}
|
||||
|
||||
List<String> expecAlleles = Arrays.asList("AA","AA","AA","GG","GG","GG","AA","TA","TT","CC","CC","GC","TC","CC","TT",
|
||||
"GG","GG","AG","TT","CC","CT","TG","GG","GG");
|
||||
List<Boolean> expecHet = Arrays.asList(false,false,false,false,false,false,false,true,false,false,false,true,true,false,
|
||||
false,false,false,true,false,false,true,true,false,false);
|
||||
List<String> expecName = Arrays.asList("NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
|
||||
"NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
|
||||
"NA12878","NA12890","NA07000");
|
||||
int snpNo = 1;
|
||||
int indiv = 1;
|
||||
int alleleOffset = 0;
|
||||
for ( ArrayList<Genotype> snp : genotypesInRod ) {
|
||||
for ( Genotype gen : snp ) {
|
||||
String alStr = new String(gen.getAlleles().get(0).getBases())+new String(gen.getAlleles().get(1).getBases());
|
||||
Assert.assertEquals("That the allele of person "+indiv+" for snp "+snpNo+" is correct "+
|
||||
"(allele offset "+alleleOffset+")", expecAlleles.get(alleleOffset),alStr);
|
||||
Assert.assertEquals("That the genotype of person "+indiv+" for snp "+snpNo+" is properly set", expecHet.get(alleleOffset),gen.isHet());
|
||||
Assert.assertEquals("That the name of person "+indiv+" for snp "+snpNo+" is correct", expecName.get(alleleOffset),samplesInRod.get(snpNo-1).get(indiv-1));
|
||||
indiv++;
|
||||
alleleOffset++;
|
||||
}
|
||||
indiv = 1;
|
||||
snpNo++;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndelBinary() {
|
||||
PlinkRod rod = new PlinkRod("binaryTest2");
|
||||
try {
|
||||
rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_indel_test.bed"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e);
|
||||
}
|
||||
|
||||
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
List<String> expecAlleles = Arrays.asList("ACCA","","ACCAACCA","GGGG","GG","","AA","TA","00","","CCTCCT","CCT",
|
||||
"TC","CC","TT","GG","GG","AG","","CTTGCTTG","CTTG","TG","GG","GG");
|
||||
List<Boolean> expecDeletion = Arrays.asList(false,false,false,false,false,false,false,false,false,true,false,true,
|
||||
false,false,false,false,false,false,true,false,true,false,false,false);
|
||||
List<Boolean> expecInsertion = Arrays.asList(true,false,true,true,true,false,false,false,false,false,false,false,
|
||||
false,false,false,false,false,false,false,false,false,false,false,false);
|
||||
|
||||
int al = 0;
|
||||
for ( ArrayList<Genotype> snp : genotypesInRod ) {
|
||||
for ( Genotype gen : snp ) {
|
||||
String alStr = new String(gen.getAlleles().get(0).getBases())+new String(gen.getAlleles().get(1).getBases());
|
||||
Allele firstAl = gen.getAlleles().get(0);
|
||||
Allele secondAl = gen.getAlleles().get(1);
|
||||
// boolean isInsertion = ( firstAl.getType() == Allele.AlleleType.INSERTION || secondAl.getType() == Allele.AlleleType.INSERTION );
|
||||
// boolean isDeletion = ( firstAl.getType() == Allele.AlleleType.DELETION || secondAl.getType() == Allele.AlleleType.DELETION );
|
||||
Assert.assertEquals("That the indel file has the correct alleles for genotype "+al,expecAlleles.get(al), alStr);
|
||||
// Assert.assertEquals("That the insertion property of genotype "+al+" is correct",expecInsertion.get(al),isInsertion);
|
||||
// Assert.assertEquals("That the deletion property of genotype "+al+" is correct", expecDeletion.get(al), isDeletion);
|
||||
al++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
|
|
|||
Loading…
Reference in New Issue