diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoWalker.java new file mode 100755 index 000000000..229afd359 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoWalker.java @@ -0,0 +1,185 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RODRecordList; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; +import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker; +import org.broadinstitute.sting.playground.gatk.walkers.varianteval.ConcordanceTruthTable; + +import java.io.*; +import java.util.LinkedList; +import java.util.HashMap; +import java.util.List; +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Nov 12, 2009 + * Time: 12:31:58 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class HapmapPoolAllelicInfoWalker extends LocusWalker { + @Argument(fullName="outputFile", shortName="of", doc="File to write to", required=true) + public String outputFileString = null; + @Argument(fullName="numIndividualsInPool", shortName="ps",doc="Pool size",required = true) + public int poolSize = -1; + @Argument(fullName="sampleNames", shortName="samples", doc="Sample name bindings", required=true) + public String sampleNameFile = null; + + private PrintWriter output; + + private String[] sampleNames = null; + private PowerBelowFrequencyWalker powerWalker = null; + private ConcordanceTruthTable ctt = null; + + public void initialize() { + sampleNames = generateNameTableFromFile(sampleNameFile); + powerWalker = new PowerBelowFrequencyWalker(); + powerWalker.initialize(); + powerWalker.setPoolSize(poolSize); + ctt = new ConcordanceTruthTable(poolSize); + } + + public PrintWriter reduceInit() { + try { + output = new PrintWriter(outputFileString); + } catch (FileNotFoundException e) { + throw new StingException("File "+outputFileString+" could not be opened.", e); + } + output.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called"); + //System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called"); + return output; + } + + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + GenomeLoc loc = context.getLocation(); + String chrom = loc.getContig(); + long pos = loc.getStart(); + char refBase = Character.toUpperCase(ref.getBase()); + List> chips = getChips(sampleNames, tracker); + Pair alleleFreqInfo = ctt.getPooledAlleleFrequency(chips,refBase); + char alternate; + if ( alleleFreqInfo.getFirst() != null && alleleFreqInfo.getFirst().isVariant(refBase)) { + //System.out.println(refBase + " " + alleleFreqInfo.getFirst().getBases()); + alternate = 
getAlternateBase(alleleFreqInfo.getFirst(),refBase); + + } else { + return null; // early return + } + int numVariantAllele = alleleFreqInfo.getSecond(); + int depth = context.numReads(); + double power = powerWalker.calculatePowerAtFrequency(context,numVariantAllele); + int called; + Variation call = (Variation) tracker.lookup("calls",null); + if ( call == null ) { + called = 0; + } else if ( call.isReference() ) { + called = 0; + } else { + called = 1; + } + + ReadBackedPileup p = new ReadBackedPileup(ref.getBase(),context); + int support = p.getBasePileupAsCounts()[BaseUtils.simpleBaseToBaseIndex(alternate)]; + + // sanity check + if ( refBase == alternate ) { + if ( alleleFreqInfo.getFirst().isVariant(refBase) ) { + logger.warn("Called as a variant! Ref: "+ refBase +"Chip data: " + alleleFreqInfo.getFirst().getBases()); + } + } + + return String.format("%s\t%d\t%c\t%c\t%d\t%d\t%f\t%d\t%d",chrom,pos,refBase,alternate,numVariantAllele,depth,power,support,called); + + } + + public char getAlternateBase(Genotype g, char ref) { + char[] bases = g.getBases().toCharArray(); + if ( Character.toUpperCase(bases[0]) != ref ) + return bases[0]; + else + return bases[1]; + } + + public PrintWriter reduce(String s, PrintWriter p) { + if ( s == null ) { + // do nothing + return p; + } else { + //System.out.printf("%s%n",s); + output.printf("%s%n",s); + return p; + } + } + + public void onTraversalDone(PrintWriter p) { + output.close(); + } + + private List> getChips(String[] rodNames, RefMetaDataTracker tracker) { + List> chips = new ArrayList >(rodNames.length); + for ( String name : rodNames ) { + RODRecordList rods = tracker.getTrackData(name, null); + Variation chip = (rods == null ? 
null : (Variation)rods.getRecords().get(0)); + if ( chip != null ) { + // chips must be Genotypes + if ( !(chip instanceof VariantBackedByGenotype) ) + throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data"); + chips.add(new Pair(((VariantBackedByGenotype)chip).getCalledGenotype(),null)); + } + } + + return chips; + } + // private methods for reading in names from a file + + private String[] generateNameTableFromFile(String file) { + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(file)); + } catch( FileNotFoundException e) { + String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath."; + throw new StingException(errMsg, e); + } + + LinkedList nameList = new LinkedList(); + + while(continueReading(reader)) { + String line = readLine(reader); + nameList.add(line); + } + + return nameList.toArray(new String[nameList.size()]); + } + + private boolean continueReading(BufferedReader reader) { + boolean continueReading; + try { + continueReading = reader.ready(); + } catch(IOException e) { + continueReading = false; + } + return continueReading; + } + + private String readLine(BufferedReader reader) { + String line; + try { + line = reader.readLine(); + } catch( IOException e) { + String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it."; + throw new StingException(errMsg,e); + } + return line; + } + +} diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoIntegrationTest.java new file mode 100755 index 000000000..0dba1aaea --- /dev/null +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoIntegrationTest.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import 
org.junit.Test;

import java.util.Arrays;

/**
 * Integration tests for the HapmapPoolAllelicInfo walker: each test runs the
 * walker over a fixed BAM/chip-data combination and pins the output to a
 * known MD5.
 */
public class HapmapPoolAllelicInfoIntegrationTest extends WalkerTest {

    /** Builds a single-output-file spec for the given command line and runs it. */
    private void runPoolTest(String description, String commandLine, String expectedMd5) {
        WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList(expectedMd5));
        executeTest(description, spec);
    }

    @Test
    public void testFHSPool3() {
        String commandLine =
                "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
                + "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
                + "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
                + "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
                + "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
                + "-ps 40 -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
        runPoolTest("Pool 3 of FHS Pilot on testbed intervals",
                commandLine, "9e44a04ea574a7f76efbc3a1f8cc710a");
    }

    @Test
    public void testFHSPool3NoIntervals() {
        String commandLine =
                "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
                + "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
                + "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
                + "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
                + "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
                + "-ps 40";
        runPoolTest("Pool 3 of FHS Pilot without testbed intervals",
                commandLine, "571c10a635a84d0bbd453897ef4fb3c6");
    }

    @Test
    public void testSmallPool() {
        String commandLine =
                "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878_NA12891_samples.txt "
                + "-B NA12878,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12878.gff "
                + "-B NA12891,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12891.gff "
                + "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.recal.annotation_subset.bam "
                + "-L chr1:14000000-18000000 -B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.ssg1b.geli.calls "
                + "-of %s -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -ps 2";
        runPoolTest("Test on pool of two individuals -- CEU father+daughter chips against subset of the father's calls; power at lod 3",
                commandLine, "46c6a22ab57c40453f29b99a57288e0e");
    }
}