diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java index e9a57a6a9..79db22fdd 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java @@ -29,6 +29,8 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{ String pathToSyzygyFile = null; @Argument(fullName = "ColumnOffset", shortName = "co", doc = "Offset of column containing the power in the pf", required = true) int colOffset = 0; + @Argument(fullName = "linesToClear", shortName="clr", doc = "Clear so many lines from the read file before starting (default - just the header line)", required = false) + int clrLines = 1; BufferedReader syzyFileReader; final String pfFileDelimiter = " "; @@ -40,7 +42,9 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{ super.initialize(); try { syzyFileReader = new BufferedReader(new FileReader(pathToSyzygyFile)); - syzyFileReader.readLine(); + for(int clear = 0; clear < clrLines; clear++) { + syzyFileReader.readLine(); + } } catch (FileNotFoundException e) { String newErrMsg = "Syzygy input file " + pathToSyzygyFile + " could be incorrect. 
File not found."; throw new StingException(newErrMsg,e); @@ -69,8 +73,8 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{ if(!syzyFileIsReady) { throw new StingException("Input file reader was not ready before an attempt to read from it."); } else if(!outOfLinesInSyzyFile) { - double syzyPow = getSyzyPowFromFile(); - out.printf("%s: %d %d %f %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first,syzyPow); + Pair syzyPow = getSyzyPowFromFile(); + out.printf("%s: %d %d %f %f (%s)%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first,syzyPow.first,syzyPow.second); } else { out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first); } @@ -79,7 +83,7 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{ return context.getReads().size(); } - public double getSyzyPowFromFile() { + public Pair getSyzyPowFromFile() { String thisLine = null; try { thisLine = syzyFileReader.readLine(); @@ -87,15 +91,17 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{ String newErrMsg = "Ran out of lines in the syzyfile; further output of Syzygy power will be suppressed."; outOfLinesInSyzyFile=true; logger.warn(newErrMsg + " " + e.toString()); - return -1.1; + return new Pair(-1.1, "Printing Stops Here"); } + String chromPos = null; StringTokenizer lineTokenizer = new StringTokenizer(thisLine, pfFileDelimiter); try { - for(int j = 0; j < colOffset; j++) { + chromPos = lineTokenizer.nextToken(); + for(int j = 1; j < colOffset; j++) { lineTokenizer.nextToken(); } - return (Double.valueOf(lineTokenizer.nextToken())/100.0); + return new Pair((Double.valueOf(lineTokenizer.nextToken())/100.0),chromPos); } catch (NoSuchElementException e) { String errMsg = "The given column offset for the pool, " + colOffset + " exceeded the number of entries in the file " + pathToSyzygyFile; throw new StingException(errMsg); diff --git 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificalPoolWalkerMk2.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificalPoolWalkerMk2.java new file mode 100755 index 000000000..cf6c67a09 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificalPoolWalkerMk2.java @@ -0,0 +1,146 @@ +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.playground.utils.ArtificialPoolContext; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.StingException; + +import java.io.FileOutputStream; +import java.io.PrintWriter; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Set; +import java.util.List; +import java.util.ListIterator; +import java.util.LinkedList; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Aug 26, 2009 + * Time: 11:28:26 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ArtificalPoolWalkerMk2 extends LocusWalker { + + @Argument(fullName = "AuxOutputFile", shortName = "af", doc = "Auxiliary file for genotype & coverage output", required = true) + String auxFilePath = null; + @Argument(fullName = "OutputBamFile", shortName = "of", doc = "Output to this file rather than standard output", required = false) + SAMFileWriter outputBamFile = null; + + public void initialize() { + } + + public ArtificialPoolContext reduceInit() { // try to initialize the file writer + ArtificialPoolContext apContext = new ArtificialPoolContext(); + apContext.setSingleSampleGenotyper(new SingleSampleGenotyper()); + apContext.setReadGroupSets(getToolkit().getMergedReadGroupsByReaders()); + apContext.setAuxWriter(initializeAuxFileWriter(apContext.getTotalNumberOfPeople())); + apContext.setSAMFileWriter(outputBamFile); + apContext.initializeSSG(); + + return apContext; + } + + public ArtificialPoolContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new ArtificialPoolContext(tracker,ref,context); + } + + public ArtificialPoolContext reduce(ArtificialPoolContext mapCon, ArtificialPoolContext redCon){ + ArtificialPoolContext updatedContext = ArtificialPoolContext.mapReduceMerge(mapCon,redCon); + List[] newReads = updatedContext.splitReadsByGroup(updatedContext.getNewReads()); + long[] newCvg = updateRunningCoverage(updatedContext.getRunningCoverage(), getCoverageByGroup(newReads)); + updatedContext.setRunningCoverage(newCvg); + List[] sampledReads = ArtificialPoolContext.sampleReads(newReads,runningCoverageToDouble(newCvg)); + printToFiles(sampledReads,updatedContext); + return updatedContext; + } + + // Helper methods follow + + private PrintWriter initializeAuxFileWriter(int nFiles) { + PrintWriter auxFileWriter; + try { + auxFileWriter = new PrintWriter(new FileOutputStream(auxFilePath)); + auxFileWriter.println(createAuxFileHeader(nFiles)); + } catch(FileNotFoundException e) { + String errmsg = 
"The filepath you entered "+auxFilePath+" could not be opened. Please double-check that the input is correct."; + throw new StingException(errmsg, e); + } catch(IOException e) { + String errmsg = "The file you entered "+auxFilePath+" could not be written to. Please check your permissions to write to this file."; + throw new StingException(errmsg,e); + } + + return auxFileWriter; + } + + private String createAuxFileHeader(int nFiles) { + String sp = " "; + String st1 = "Chrom:Pos" + sp; + String st2 = ""; + for(int j = 0; j < nFiles; j++) { + st2 = st2 + "Pers " + j + " Gen" + sp; // short for "genotype of person j at this location" + st2 = st2 + "Pers " + j + " Conf" + sp; // short for "confidence in genotype call of ..." + st2 = st2 + "Pers " + j + " NewCvg" + sp; // short for "coverage of person j at this location" + } + String st3 = "TotalCvg"; + + return st1+st2+st3; + } + + private int[] getCoverageByGroup(List[] readsByGroup) { + int[] coverage = new int[readsByGroup.length]; + for(int iterator = 0; iterator < readsByGroup.length; iterator ++) { + coverage[iterator] = readsByGroup[iterator].size(); + } + + return coverage; + } + + private long[] updateRunningCoverage(long[] cvgUpToNow, int[] newCvgByGroup) { + long[] newCvg = new long[cvgUpToNow.length]; + for(int iter = 0; iter < cvgUpToNow.length; iter++) { + newCvg[iter] = cvgUpToNow[iter] + newCvgByGroup[iter]; + } + + return newCvg; + } + + private double[] runningCoverageToDouble(long[] cvg) { + double[] avgProp = new double[cvg.length]; + long sum = 0; + for(long elem : cvg) { + sum += elem; + } + for(int iter = 0; iter < cvg.length; iter++) { + avgProp[iter] = cvg[iter]/sum; + } + + return avgProp; + } + + private void printToFiles(List[] sampledNewReads, ArtificialPoolContext context) { + SAMFileWriter samWrite = context.getSAMFileWriter(); + String sp = " "; + PrintWriter auxWrite = context.getWriterToAuxiliaryFile(); + int readGroupInt = 0; + for(List readGroup : sampledNewReads) { + for(SAMRecord 
read : readGroup) { + samWrite.addAlignment(read); + } + auxWrite.print(context.getAlignmentContext().getLocation().toString() + sp); + auxWrite.print(context.genotypeAndConfidenceToString(readGroupInt,sp)); + readGroupInt++; + } + + } + + +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java index ebfbc2d44..7298a8673 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ArtificialPoolWalker.java @@ -54,6 +54,7 @@ public class ArtificialPoolWalker extends LocusWalker[], SAMFile private LinkedList[] living_reads; private SingleSampleGenotyper ssg; private int npeople; + //TODO: LOCAL CLASS FOR ALL THIS //@param local_genotypes - holds the genotype (A A/ A C/ etc) for each individual. Updates at each locus. //@param auxWrite - the writer to the auxiliary file //@param readGroupSets : holds the readgroups (identifiers for individuals from each read) @@ -86,8 +87,6 @@ public class ArtificialPoolWalker extends LocusWalker[], SAMFile if(red_prop <= 0) { red_prop = 1.0/npeople; - } else { - // do nothing muhahaha } // initialize the local genotype array @@ -112,7 +111,7 @@ public class ArtificialPoolWalker extends LocusWalker[], SAMFile updateLiving(); // each time we move to the next locus, remove from the coverage count those reads that ended - auxWrite.printf("%s:%s",context.getContig(),context.getPosition()); + auxWrite.printf("%s:%s",context.getContig(),context.getPosition()); // TODO: PUT IN REDUCE return getNewReadsAndGenotypesByGroup(tracker, ref, context); } @@ -134,7 +133,6 @@ public class ArtificialPoolWalker extends LocusWalker[], SAMFile List[] randomReadsByGroup = drawReadsRandomlyFromReadsByGroup(readsByReadGroup,sought_coverage); 
printToFileAndAuxFile(randomReadsByGroup,sought_coverage,outFile); return outFile; - } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java index 61db70583..e21033ba4 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java @@ -32,15 +32,10 @@ public class CoverageAndPowerWalker extends LocusWalker depth/2 - calculate power as P[hits between k and depth] - for(int k = kaccept; k < depth; k++) { - pow += MathUtils.binomialProbabilityLog(k, depth, snp_prop); } @@ -153,14 +146,8 @@ public class CoverageAndPowerWalker extends LocusWalker depth + 1) { - throw new RuntimeException("qscore iterator exceeding possible thresholds"); - } - } - - return readspositionrandom - 1; + return (int) Math.floor((double)depth * Math.random()); } diff --git a/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java b/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java new file mode 100755 index 000000000..bca475a1d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java @@ -0,0 +1,286 @@ +package org.broadinstitute.sting.playground.utils; + +import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.genotype.GenotypeCall; + +import java.io.PrintWriter; +import java.util.Set; +import java.util.List; +import java.util.LinkedList; +import java.util.ArrayList; 
+ +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileWriter; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Aug 26, 2009 + * Time: 11:37:42 AM + * To change this template use File | Settings | File Templates. + */ +public class ArtificialPoolContext { + private PrintWriter writerToAuxiliaryFile; + private SAMFileWriter writerToSamFile; + private SingleSampleGenotyper ssg; + private List> readGroupSets; + private long[] runningCoverage; + private RefMetaDataTracker refTracker; + private ReferenceContext refContext; + private AlignmentContext aliContext; + + public ArtificialPoolContext() { + readGroupSets = null; + writerToAuxiliaryFile = null; + writerToSamFile = null; + ssg = null; + refTracker = null; + aliContext = null; + refContext = null; + runningCoverage = null; + } + + public ArtificialPoolContext(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + refTracker = tracker; + refContext = ref; + aliContext = context; + readGroupSets = null; + writerToAuxiliaryFile = null; + writerToSamFile=null; + ssg = null; + runningCoverage = null; + } + + public ArtificialPoolContext(PrintWriter pw, SAMFileWriter sw, SingleSampleGenotyper g, List> rgs, long [] runcvg, RefMetaDataTracker rt, ReferenceContext rc, AlignmentContext ac) { + writerToAuxiliaryFile = pw; + writerToSamFile = sw; + ssg = g; + readGroupSets = rgs; + runningCoverage = runcvg; + refTracker = rt; + refContext = rc; + aliContext = ac; + } + + public void setAuxWriter(PrintWriter writer) { + writerToAuxiliaryFile = writer; + } + + public void setSingleSampleGenotyper(SingleSampleGenotyper typer) { + ssg = typer; + } + + public void initializeSSG() { + ssg.initialize(); + } + + public void setReadGroupSets(List> rgSets) { + readGroupSets = rgSets; + } + + public void setRefMetaDataTracker(RefMetaDataTracker tracker) { + refTracker = tracker; + } + + public void setReferenceContext(ReferenceContext ref) { + refContext = ref; + } + + public void 
setAlignmentContext(AlignmentContext context) { + aliContext = context; + } + + public void setRunningCoverage(long[] estimate) { + runningCoverage = estimate; + } + + public void setSAMFileWriter(SAMFileWriter writer) { + writerToSamFile = writer; + } + + public int getTotalNumberOfPeople() { + return readGroupSets.size(); + } + + public RefMetaDataTracker getRefMetaDataTracker() { + return refTracker; + } + + public ReferenceContext getReferenceContext() { + return refContext; + } + + public AlignmentContext getAlignmentContext() { + return aliContext; + } + + public PrintWriter getWriterToAuxiliaryFile() { + return writerToAuxiliaryFile; + } + + public SingleSampleGenotyper getSingleSampleGenotyper() { + return ssg; + } + + public List> getReadGroupSets() { + return readGroupSets; + } + + public long[] getRunningCoverage() { + return runningCoverage; + } + + public SAMFileWriter getSAMFileWriter() { + return writerToSamFile; + } + + public List getReads() { + List reads; + if(aliContext == null) { + reads=null; + } else { + reads = aliContext.getReads(); + } + + return reads; + } + + public List getOffsets() { + List offsets; + if(aliContext == null) { + offsets = null; + } else { + offsets = aliContext.getOffsets(); + } + + return offsets; + } + + public List getNewReads() { + List newReads; + + if(aliContext == null) { + newReads = null; + } else { + newReads = new LinkedList(); + List allReads = aliContext.getReads(); + List allOffsets = aliContext.getOffsets(); + + for(int iter = 0; iter < allReads.size(); iter++) { + if(allOffsets.get(iter) == 0) { + newReads.add(allReads.get(iter)); + } + } + } + + return newReads; + } + + public Pair[],List[]> splitByGroup(List unsplitReads, List unsplitOffsets) { + List[] readsSplitByGroup; + List [] offsetsSplitByGroup; + + if(unsplitReads != null && readGroupSets != null) { + readsSplitByGroup = new ArrayList[this.getTotalNumberOfPeople()]; + if(unsplitOffsets != null) { + offsetsSplitByGroup = new 
ArrayList[this.getTotalNumberOfPeople()]; + } + else { + offsetsSplitByGroup = null; + } + int listSize = unsplitReads.size(); + for(int element = 0; element < listSize; element++) { + SAMRecord read = unsplitReads.get(element); + for(int groupNumber = 0; groupNumber < this.getTotalNumberOfPeople(); groupNumber++) { + if(readGroupSets.get(groupNumber).contains((String) read.getAttribute("RG"))) { + if(readsSplitByGroup[groupNumber] == null) { readsSplitByGroup[groupNumber] = new ArrayList(); } readsSplitByGroup[groupNumber].add(read); // lazily create the per-group list: new ArrayList[n] leaves every element null + if(offsetsSplitByGroup != null) { + if(offsetsSplitByGroup[groupNumber] == null) { offsetsSplitByGroup[groupNumber] = new ArrayList(); } offsetsSplitByGroup[groupNumber].add(unsplitOffsets.get(element)); + } + break; + } + } + } + } else { + readsSplitByGroup = null; + offsetsSplitByGroup = null; // compiler complains without these lines + } + + return new Pair(readsSplitByGroup,offsetsSplitByGroup); + } + + public List[] splitReadsByGroup(List unsplitReads) { + return (this.splitByGroup(unsplitReads,null)).first; + } + + + // Static methods follow + + public static ArtificialPoolContext mapReduceMerge(ArtificialPoolContext mapContext, ArtificialPoolContext reduceContext) { + return new ArtificialPoolContext(reduceContext.getWriterToAuxiliaryFile(),reduceContext.getSAMFileWriter(), + reduceContext.getSingleSampleGenotyper(), reduceContext.getReadGroupSets(), reduceContext.getRunningCoverage(), + mapContext.getRefMetaDataTracker(),mapContext.getReferenceContext(),mapContext.getAlignmentContext()); + } + + public static Pair[],List> sampleReadsAndOffsets(List[] reads, List[] offsets, double[] propEstGlobal) { + double[] samplingRate = calculateSamplingRateFromGlobalEstimate(propEstGlobal); + List[] sampledReads = new ArrayList[reads.length]; + List[] sampledOffsets; + if(offsets != null){ + sampledOffsets = new ArrayList[offsets.length]; + } else { + sampledOffsets = null; + } + + for(int group = 0; group < reads.length; group++) { + for(int readNumber = 0; readNumber < reads[group].size(); readNumber++) { + if(Math.random() < samplingRate[group]) { + if(sampledReads[group] == null) { sampledReads[group] = new ArrayList(); } sampledReads[group].add(reads[group].get(readNumber)); // lazily create the per-group list: array elements start out null + if(sampledOffsets != 
null) { + if(sampledOffsets[group] == null) { sampledOffsets[group] = new ArrayList(); } sampledOffsets[group].add(offsets[group].get(readNumber)); // lazily create the per-group list: array elements start out null + } + } + } + + return new Pair(sampledReads,sampledOffsets); + } + + public String genotypeAndConfidenceToString(int group, String spacer) { + GenotypeCall call = this.getGenotypeCall(group); + return (call.getGenotypes() + spacer + call.getConfidenceScore().toString()); + } + + public GenotypeCall getGenotypeCall(int group) { + AlignmentContext alicon = this.getAlignmentContext(); + Pair[],List[]> byGroupSplitPair = this.splitByGroup(alicon.getReads(),alicon.getOffsets()); + return ssg.map(this.getRefMetaDataTracker(),this.getReferenceContext(), + new AlignmentContext(this.getAlignmentContext().getLocation(), byGroupSplitPair.first[group],byGroupSplitPair.second[group])); + } + + public static List[] sampleReads(List[] reads, double[] propEstGlobal) { + return (sampleReadsAndOffsets(reads, null, propEstGlobal)).first; + } + + public static double[] calculateSamplingRateFromGlobalEstimate(double[] ratios) { + double min = ratios[0]; + for(double ratio : ratios) { + if(ratio < min) { + min = ratio; + } + } + double[] samplingRate = new double[ratios.length]; + // now divide by minimum + for(int j = 0; j < ratios.length; j++) { + samplingRate[j] = ratios[j]/min; + } + + return samplingRate; + } + +}