Added a walker that outputs relevant information about false negatives, given a set of HapMap individuals, together with corresponding integration tests for it.

This will output for hapmap variant sites:

chromosome  position  ref allele   variant allele   number of variant alleles of the individuals   depth of coverage   power to detect singletons at lod 3   number of variant bases seen   whether or not variant was called




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2068 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-11-18 15:47:52 +00:00
parent b68d6e06b7
commit be31d7f4cc
2 changed files with 241 additions and 0 deletions

View File

@ -0,0 +1,185 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker;
import org.broadinstitute.sting.playground.gatk.walkers.varianteval.ConcordanceTruthTable;
import java.io.*;
import java.util.LinkedList;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Nov 12, 2009
* Time: 12:31:58 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Reports per-site allelic information for HapMap variant sites in a pooled sample,
 * intended for diagnosing false negatives. For each locus where the pooled chip
 * genotypes show a variant, emits one tab-separated line:
 *
 *   chrom  pos  ref  var  num_variant_alleles  depth  power  supporting_bases  called(0/1)
 *
 * Power is the power to detect the variant at the observed allele frequency
 * (delegated to PowerBelowFrequencyWalker); "called" reflects whether the
 * "calls" ROD track contains a non-reference call at the site.
 */
public class HapmapPoolAllelicInfoWalker extends LocusWalker<String, PrintWriter> {

    @Argument(fullName="outputFile", shortName="of", doc="File to write to", required=true)
    public String outputFileString = null;
    @Argument(fullName="numIndividualsInPool", shortName="ps",doc="Pool size",required = true)
    public int poolSize = -1;
    @Argument(fullName="sampleNames", shortName="samples", doc="Sample name bindings", required=true)
    public String sampleNameFile = null;

    // Writer for the tab-separated report; opened in reduceInit, closed in onTraversalDone.
    private PrintWriter output;
    // ROD track names, one per pooled individual, read from sampleNameFile (one name per line).
    private String[] sampleNames = null;
    // Delegate used to compute detection power at a given allele frequency.
    private PowerBelowFrequencyWalker powerWalker = null;
    // Delegate used to collapse individual chip genotypes into a pooled allele frequency.
    private ConcordanceTruthTable ctt = null;

    /**
     * Reads the sample-name bindings and sets up the power and concordance delegates.
     */
    public void initialize() {
        sampleNames = generateNameTableFromFile(sampleNameFile);
        powerWalker = new PowerBelowFrequencyWalker();
        powerWalker.initialize();
        powerWalker.setPoolSize(poolSize);
        ctt = new ConcordanceTruthTable(poolSize);
    }

    /**
     * Opens the output file and writes the column header.
     *
     * @return the PrintWriter used to accumulate output
     * @throws StingException if the output file cannot be opened
     */
    public PrintWriter reduceInit() {
        try {
            output = new PrintWriter(outputFileString);
        } catch (FileNotFoundException e) {
            throw new StingException("File "+outputFileString+" could not be opened.", e);
        }
        output.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called");
        return output;
    }

    /**
     * Examines a single locus: if the pooled chip genotypes are variant there,
     * builds one tab-separated report line; otherwise returns null so reduce()
     * skips the site.
     *
     * @param tracker reference-ordered data at this locus (chip tracks and "calls")
     * @param ref     reference context at this locus
     * @param context read pileup at this locus
     * @return a formatted report line, or null for non-variant sites
     */
    public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        GenomeLoc loc = context.getLocation();
        String chrom = loc.getContig();
        long pos = loc.getStart();
        char refBase = Character.toUpperCase(ref.getBase());
        List<Pair<Genotype, Genotype>> chips = getChips(sampleNames, tracker);
        Pair<Genotype,Integer> alleleFreqInfo = ctt.getPooledAlleleFrequency(chips,refBase);
        char alternate;
        if ( alleleFreqInfo.getFirst() != null && alleleFreqInfo.getFirst().isVariant(refBase)) {
            alternate = getAlternateBase(alleleFreqInfo.getFirst(),refBase);
        } else {
            return null; // not a variant site according to the chips -- nothing to report
        }
        int numVariantAllele = alleleFreqInfo.getSecond();
        int depth = context.numReads();
        double power = powerWalker.calculatePowerAtFrequency(context,numVariantAllele);
        // A site counts as "called" only when the calls track has a non-reference record.
        int called;
        Variation call = (Variation) tracker.lookup("calls",null);
        if ( call == null ) {
            called = 0;
        } else if ( call.isReference() ) {
            called = 0;
        } else {
            called = 1;
        }
        ReadBackedPileup p = new ReadBackedPileup(ref.getBase(),context);
        int support = p.getBasePileupAsCounts()[BaseUtils.simpleBaseToBaseIndex(alternate)];
        // Sanity check: a variant genotype should never yield an alternate equal to ref.
        if ( refBase == alternate ) {
            if ( alleleFreqInfo.getFirst().isVariant(refBase) ) {
                logger.warn("Called as a variant! Ref: "+ refBase +" Chip data: " + alleleFreqInfo.getFirst().getBases());
            }
        }
        return String.format("%s\t%d\t%c\t%c\t%d\t%d\t%f\t%d\t%d",chrom,pos,refBase,alternate,numVariantAllele,depth,power,support,called);
    }

    /**
     * Picks the non-reference base out of a diploid genotype.
     *
     * @param g   a (variant) genotype with at least two bases
     * @param ref the reference base, upper case
     * @return the first base of g that differs from ref (the second base if the
     *         first matches the reference)
     */
    public char getAlternateBase(Genotype g, char ref) {
        char[] bases = g.getBases().toCharArray();
        if ( Character.toUpperCase(bases[0]) != ref )
            return bases[0];
        else
            return bases[1];
    }

    /**
     * Appends a non-null map result to the output; null results (non-variant
     * sites) are silently skipped.
     */
    public PrintWriter reduce(String s, PrintWriter p) {
        if ( s != null ) {
            // p is the writer returned by reduceInit; write through it rather
            // than the field so reduce honors its parameter.
            p.printf("%s%n",s);
        }
        return p;
    }

    /**
     * Flushes and closes the output file at the end of the traversal.
     */
    public void onTraversalDone(PrintWriter p) {
        output.close();
    }

    /**
     * Pulls the called chip genotype for each named ROD track at the current locus.
     * Tracks absent at this locus are skipped; tracks that are present must be
     * backed by genotypes.
     *
     * @throws StingException if a present chip record is not genotype-backed
     */
    private List<Pair<Genotype,Genotype>> getChips(String[] rodNames, RefMetaDataTracker tracker) {
        List<Pair<Genotype, Genotype>> chips = new ArrayList <Pair<Genotype,Genotype>>(rodNames.length);
        for ( String name : rodNames ) {
            RODRecordList<ReferenceOrderedDatum> rods = tracker.getTrackData(name, null);
            Variation chip = (rods == null ? null : (Variation)rods.getRecords().get(0));
            if ( chip != null ) {
                // chips must be Genotypes
                if ( !(chip instanceof VariantBackedByGenotype) )
                    throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data");
                chips.add(new Pair<Genotype,Genotype>(((VariantBackedByGenotype)chip).getCalledGenotype(),null));
            }
        }
        return chips;
    }

    // private methods for reading in names from a file

    /**
     * Reads one sample/track name per line from the given file.
     *
     * @throws StingException if the file does not exist
     */
    private String[] generateNameTableFromFile(String file) {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new FileReader(file));
        } catch( FileNotFoundException e) {
            String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
            throw new StingException(errMsg, e);
        }
        LinkedList<String> nameList = new LinkedList<String>();
        try {
            while(continueReading(reader)) {
                String line = readLine(reader);
                nameList.add(line);
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                // non-fatal: all names were already read; nothing useful to do on close failure
            }
        }
        return nameList.toArray(new String[nameList.size()]);
    }

    /**
     * @return true while the reader reports more data available; an IOException
     *         from ready() is treated as end-of-input rather than an error.
     */
    private boolean continueReading(BufferedReader reader) {
        boolean continueReading;
        try {
            continueReading = reader.ready();
        } catch(IOException e) {
            continueReading = false;
        }
        return continueReading;
    }

    /**
     * Reads one line, converting the (unexpected) IOException into a StingException
     * since continueReading() just declared the reader ready.
     */
    private String readLine(BufferedReader reader) {
        String line;
        try {
            line = reader.readLine();
        } catch( IOException e) {
            String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it.";
            throw new StingException(errMsg,e);
        }
        return line;
    }
}

View File

@ -0,0 +1,56 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Nov 18, 2009
* Time: 9:29:31 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Integration tests for the HapmapPoolAllelicInfo walker: two runs over FHS
 * pilot pool 3 (with and without a testbed interval list) and one small
 * two-individual pool built from CEU chip data. Each test pins the walker's
 * output to a known MD5.
 */
public class HapmapPoolAllelicInfoIntegrationTest extends WalkerTest {

    @Test
    public void testFHSPool3() {
        // FHS pilot pool 3, restricted to the testbed intervals.
        final String expectedMd5 = "9e44a04ea574a7f76efbc3a1f8cc710a";
        final String commandLine =
                "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
                        + "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
                        + "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
                        + "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
                        + "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
                        + "-ps 40 -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
        executeTest("Pool 3 of FHS Pilot on testbed intervals",
                new WalkerTestSpec(commandLine, 1, Arrays.asList(expectedMd5)));
    }

    @Test
    public void testFHSPool3NoIntervals() {
        // Same pool as above but traversing everything (no -L argument).
        final String expectedMd5 = "571c10a635a84d0bbd453897ef4fb3c6";
        final String commandLine =
                "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
                        + "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
                        + "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
                        + "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
                        + "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
                        + "-ps 40";
        executeTest("Pool 3 of FHS Pilot without testbed intervals",
                new WalkerTestSpec(commandLine, 1, Arrays.asList(expectedMd5)));
    }

    @Test
    public void testSmallPool() {
        // Minimal pool of two CEU individuals (NA12878 + NA12891) on a 4Mb slice of chr1.
        final String expectedMd5 = "46c6a22ab57c40453f29b99a57288e0e";
        final String commandLine =
                "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878_NA12891_samples.txt "
                        + "-B NA12878,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12878.gff "
                        + "-B NA12891,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12891.gff "
                        + "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.recal.annotation_subset.bam "
                        + "-L chr1:14000000-18000000 -B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.ssg1b.geli.calls "
                        + "-of %s -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -ps 2";
        executeTest("Test on pool of two individuals -- CEU father+daughter chips against subset of the father's calls; power at lod 3",
                new WalkerTestSpec(commandLine, 1, Arrays.asList(expectedMd5)));
    }
}