Minor changes to CoverageAndPowerWalker bootstrapping (faster selection of indices).

Entirely new Artificial Pool Walker (ArtificialPoolWalkerMk2), will likely replace ArtificialPoolWalker on the next commit. Adapted the method of sampling, and added a helper context class: ArtificialPoolContext which carries much of the burden of calculation and data handling for the walker. The walker itself maps and reduces ArtificialPoolContexts.

Cheers!






git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1461 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-08-26 21:42:35 +00:00
parent 92ea947c33
commit 1da45cffb3
5 changed files with 450 additions and 35 deletions

View File

@ -29,6 +29,8 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{
String pathToSyzygyFile = null;
@Argument(fullName = "ColumnOffset", shortName = "co", doc = "Offset of column containing the power in the pf", required = true)
int colOffset = 0;
@Argument(fullName = "linesToClear", shortName="clr", doc = "Clear so many lines from the read file before starting (default - just the header line)", required = false)
int clrLines = 1;
BufferedReader syzyFileReader;
final String pfFileDelimiter = " ";
@ -40,7 +42,9 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{
super.initialize();
try {
syzyFileReader = new BufferedReader(new FileReader(pathToSyzygyFile));
syzyFileReader.readLine();
for(int clear = 0; clear < clrLines; clear++) {
syzyFileReader.readLine();
}
} catch (FileNotFoundException e) {
String newErrMsg = "Syzygy input file " + pathToSyzygyFile + " could be incorrect. File not found.";
throw new StingException(newErrMsg,e);
@ -69,8 +73,8 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{
if(!syzyFileIsReady) {
throw new StingException("Input file reader was not ready before an attempt to read from it.");
} else if(!outOfLinesInSyzyFile) {
double syzyPow = getSyzyPowFromFile();
out.printf("%s: %d %d %f %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first,syzyPow);
Pair<Double,String> syzyPow = getSyzyPowFromFile();
out.printf("%s: %d %d %f %f (%s)%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first,syzyPow.first,syzyPow.second);
} else {
out.printf("%s: %d %d %f%n", context.getLocation(), context.getReads().size(),powpair.second,powpair.first);
}
@ -79,7 +83,7 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{
return context.getReads().size();
}
public double getSyzyPowFromFile() {
public Pair<Double,String> getSyzyPowFromFile() {
String thisLine = null;
try {
thisLine = syzyFileReader.readLine();
@ -87,15 +91,17 @@ public class AnalyzePowerWalker extends CoverageAndPowerWalker{
String newErrMsg = "Ran out of lines in the syzyfile; further output of Syzygy power will be suppressed.";
outOfLinesInSyzyFile=true;
logger.warn(newErrMsg + " " + e.toString());
return -1.1;
return new Pair(-1.1, "Printing Stops Here");
}
String chromPos = null;
StringTokenizer lineTokenizer = new StringTokenizer(thisLine, pfFileDelimiter);
try {
for(int j = 0; j < colOffset; j++) {
chromPos = lineTokenizer.nextToken();
for(int j = 1; j < colOffset; j++) {
lineTokenizer.nextToken();
}
return (Double.valueOf(lineTokenizer.nextToken())/100.0);
return new Pair((Double.valueOf(lineTokenizer.nextToken())/100.0),chromPos);
} catch (NoSuchElementException e) {
String errMsg = "The given column offset for the pool, " + colOffset + " exceeded the number of entries in the file " + pathToSyzygyFile;
throw new StingException(errMsg);

View File

@ -0,0 +1,146 @@
package org.broadinstitute.sting.playground.gatk.walkers.poolseq;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.playground.utils.ArtificialPoolContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.StingException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Set;
import java.util.List;
import java.util.ListIterator;
import java.util.LinkedList;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Aug 26, 2009
* Time: 11:28:26 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Locus walker that builds an "artificial pool" from per-person BAM readers: at each locus it
 * collects the reads that start there, tracks running coverage per read group (person),
 * downsamples, genotypes each person via {@link SingleSampleGenotyper}, and writes the sampled
 * reads to a BAM plus a per-locus genotype/coverage row to an auxiliary text file.
 *
 * NOTE(review): the class name is missing an "i" ("Artifical..."); left unchanged because a
 * public-class rename also forces a file rename and updates at every call site.
 */
public class ArtificalPoolWalkerMk2 extends LocusWalker<ArtificialPoolContext, ArtificialPoolContext> {
    @Argument(fullName = "AuxOutputFile", shortName = "af", doc = "Auxiliary file for genotype & coverage output", required = true)
    String auxFilePath = null;
    @Argument(fullName = "OutputBamFile", shortName = "of", doc = "Output to this file rather than standard output", required = false)
    SAMFileWriter outputBamFile = null;

    public void initialize() {
        // nothing to do here; all setup happens in reduceInit()
    }

    /**
     * Builds the initial reduce state: a context wired with the genotyper, the per-person
     * read-group sets, the auxiliary writer, the (optional) BAM writer, and a zeroed
     * running-coverage array.
     */
    public ArtificialPoolContext reduceInit() {
        ArtificialPoolContext apContext = new ArtificialPoolContext();
        apContext.setSingleSampleGenotyper(new SingleSampleGenotyper());
        apContext.setReadGroupSets(getToolkit().getMergedReadGroupsByReaders());
        apContext.setAuxWriter(initializeAuxFileWriter(apContext.getTotalNumberOfPeople()));
        apContext.setSAMFileWriter(outputBamFile);
        // FIX: start running coverage at zero for every read group. Leaving it null caused a
        // NullPointerException on the first reduce() call (updateRunningCoverage dereferences it).
        apContext.setRunningCoverage(new long[apContext.getTotalNumberOfPeople()]);
        apContext.initializeSSG();
        return apContext;
    }

    /** Wraps the per-locus tracker/reference/alignment triple in a context for reduce(). */
    public ArtificialPoolContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        return new ArtificialPoolContext(tracker, ref, context);
    }

    /**
     * Merges the locus context from map() into the accumulated state, updates the running
     * per-group coverage, downsamples the reads that start at this locus, and writes them out.
     */
    public ArtificialPoolContext reduce(ArtificialPoolContext mapCon, ArtificialPoolContext redCon) {
        ArtificialPoolContext updatedContext = ArtificialPoolContext.mapReduceMerge(mapCon, redCon);
        List<SAMRecord>[] newReads = updatedContext.splitReadsByGroup(updatedContext.getNewReads());
        long[] newCvg = updateRunningCoverage(updatedContext.getRunningCoverage(), getCoverageByGroup(newReads));
        updatedContext.setRunningCoverage(newCvg);
        List<SAMRecord>[] sampledReads = ArtificialPoolContext.sampleReads(newReads, runningCoverageToDouble(newCvg));
        printToFiles(sampledReads, updatedContext);
        return updatedContext;
    }

    // Helper methods follow

    /**
     * Opens the auxiliary output file and writes the column header.
     *
     * @param nFiles number of people (one genotype/confidence/coverage column triple each)
     * @return a writer positioned just after the header line
     * @throws StingException if the file cannot be opened or written
     */
    private PrintWriter initializeAuxFileWriter(int nFiles) {
        PrintWriter auxFileWriter;
        try {
            auxFileWriter = new PrintWriter(new FileOutputStream(auxFilePath));
            auxFileWriter.println(createAuxFileHeader(nFiles));
        } catch(FileNotFoundException e) {
            String errmsg = "The filepath you entered "+auxFilePath+" could not be opened. Please double-check that the input is correct.";
            throw new StingException(errmsg, e);
        } catch(IOException e) {
            String errmsg = "The file you entered "+auxFilePath+" could not be written to. Please check your permissions to write to this file.";
            throw new StingException(errmsg,e);
        }
        return auxFileWriter;
    }

    /** Builds the header row: locus column, three columns per person, then total coverage. */
    private String createAuxFileHeader(int nFiles) {
        String sp = " ";
        String st1 = "Chrom:Pos" + sp;
        String st2 = "";
        for(int j = 0; j < nFiles; j++) {
            st2 = st2 + "Pers " + j + " Gen" + sp; // short for "genotype of person j at this location"
            st2 = st2 + "Pers " + j + " Conf" + sp; // short for "confidence in genotype call of ..."
            st2 = st2 + "Pers " + j + " NewCvg" + sp; // short for "coverage of person j at this location"
        }
        String st3 = "TotalCvg";
        return st1+st2+st3;
    }

    /** Returns the number of reads in each per-group list. */
    private int[] getCoverageByGroup(List<SAMRecord>[] readsByGroup) {
        int[] coverage = new int[readsByGroup.length];
        for(int iterator = 0; iterator < readsByGroup.length; iterator++) {
            coverage[iterator] = readsByGroup[iterator].size();
        }
        return coverage;
    }

    /** Element-wise sum of the accumulated coverage and this locus' new coverage. */
    private long[] updateRunningCoverage(long[] cvgUpToNow, int[] newCvgByGroup) {
        long[] newCvg = new long[cvgUpToNow.length];
        for(int iter = 0; iter < cvgUpToNow.length; iter++) {
            newCvg[iter] = cvgUpToNow[iter] + newCvgByGroup[iter];
        }
        return newCvg;
    }

    /**
     * Converts per-group running coverage counts into proportions of the total.
     *
     * @return an array summing to 1.0, or all zeros when no coverage has been seen yet
     */
    private double[] runningCoverageToDouble(long[] cvg) {
        double[] avgProp = new double[cvg.length];
        long sum = 0;
        for(long elem : cvg) {
            sum += elem;
        }
        if (sum == 0) {
            // FIX: previously divided by zero (long division throws ArithmeticException).
            // No coverage anywhere implies no reads to sample, so all-zero proportions are safe.
            return avgProp;
        }
        for(int iter = 0; iter < cvg.length; iter++) {
            // FIX: cvg[iter]/sum was long integer division, which truncated every
            // proportion below 1.0 down to 0; cast to double before dividing.
            avgProp[iter] = (double) cvg[iter] / sum;
        }
        return avgProp;
    }

    /**
     * Writes the sampled reads to the BAM (when one was supplied) and appends one row to the
     * auxiliary file: locus, then genotype + confidence per person.
     */
    private void printToFiles(List<SAMRecord>[] sampledNewReads, ArtificialPoolContext context) {
        SAMFileWriter samWrite = context.getSAMFileWriter();
        String sp = " ";
        PrintWriter auxWrite = context.getWriterToAuxiliaryFile();
        // FIX: the locus belongs once per row (see createAuxFileHeader), not once per read
        // group as before.
        auxWrite.print(context.getAlignmentContext().getLocation().toString() + sp);
        int readGroupInt = 0;
        for(List<SAMRecord> readGroup : sampledNewReads) {
            if (samWrite != null) { // FIX: OutputBamFile is optional (required=false); guard NPE
                for(SAMRecord read : readGroup) {
                    samWrite.addAlignment(read);
                }
            }
            auxWrite.print(context.genotypeAndConfidenceToString(readGroupInt,sp));
            readGroupInt++;
        }
        // FIX: terminate the row; the header promises one line per locus.
        // TODO(review): header also declares "NewCvg" and "TotalCvg" columns that are not
        // emitted here — confirm the intended row format.
        auxWrite.println();
    }
}

View File

@ -54,6 +54,7 @@ public class ArtificialPoolWalker extends LocusWalker<List<SAMRecord>[], SAMFile
private LinkedList<Integer>[] living_reads;
private SingleSampleGenotyper ssg;
private int npeople;
//TODO: LOCAL CLASS FOR ALL THIS
//@param local_genotypes - holds the genotype (A A/ A C/ etc) for each individual. Updates at each locus.
//@param auxWrite - the writer to the auxiliary file
//@param readGroupSets : holds the readgroups (identifiers for individuals from each read)
@ -86,8 +87,6 @@ public class ArtificialPoolWalker extends LocusWalker<List<SAMRecord>[], SAMFile
if(red_prop <= 0) {
red_prop = 1.0/npeople;
} else {
// do nothing muhahaha
}
// initialize the local genotype array
@ -112,7 +111,7 @@ public class ArtificialPoolWalker extends LocusWalker<List<SAMRecord>[], SAMFile
updateLiving();
// each time we move to the next locus, remove from the coverage count those reads that ended
auxWrite.printf("%s:%s",context.getContig(),context.getPosition());
auxWrite.printf("%s:%s",context.getContig(),context.getPosition()); // TODO: PUT IN REDUCE
return getNewReadsAndGenotypesByGroup(tracker, ref, context);
}
@ -134,7 +133,6 @@ public class ArtificialPoolWalker extends LocusWalker<List<SAMRecord>[], SAMFile
List<SAMRecord>[] randomReadsByGroup = drawReadsRandomlyFromReadsByGroup(readsByReadGroup,sought_coverage);
printToFileAndAuxFile(randomReadsByGroup,sought_coverage,outFile);
return outFile;
}

View File

@ -32,15 +32,10 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
@Argument(fullName="lodThreshold", shortName="lt", doc="Threshold for LOD score for calls")
public double threshold = 3.0;
private static final int BOOTSTRAP_ITERATIONS = 300;
@Override
public void initialize()
{
public void initialize() {
if(num_individuals <= 0)
throw new IllegalArgumentException("Positive nonzero parameter expected for poolSize");
@ -117,9 +112,7 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
double pow = 0;
if(depth - kaccept < kaccept) {// kaccept > depth/2 - calculate power as P[hits between k and depth]
for(int k = kaccept; k < depth; k++) {
pow += MathUtils.binomialProbabilityLog(k, depth, snp_prop);
}
@ -153,14 +146,8 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
if (depth <= 0) {
result = new Pair(-1,-1);
} else if (!use_bootstrap) { // object data from command line
result = powerTheoretical(depth, reads, offsets, single_snip_proportion);
} else {
//
// otherwise, bootstrapping occurs below
//
} else { // otherwise, bootstrapping occurs below
int hypothesis_rejections=0;
for(int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++)
@ -196,15 +183,7 @@ public class CoverageAndPowerWalker extends LocusWalker<Integer, Pair<Long, Long
public int randomlySelectRead(int depth)
{
double qscore_selector = Math.random();
int readspositionrandom;
for(readspositionrandom = 1; readspositionrandom < ((double)depth * qscore_selector); readspositionrandom ++) {
if(readspositionrandom > depth + 1) {
throw new RuntimeException("qscore iterator exceeding possible thresholds");
}
}
return readspositionrandom - 1;
return (int) Math.floor((double)depth * Math.random());
}

View File

@ -0,0 +1,286 @@
package org.broadinstitute.sting.playground.utils;
import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.GenotypeCall;
import java.io.PrintWriter;
import java.util.Set;
import java.util.List;
import java.util.LinkedList;
import java.util.ArrayList;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Aug 26, 2009
* Time: 11:37:42 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Carrier object for ArtificalPoolWalkerMk2's map/reduce pipeline. A map-side instance holds the
 * per-locus tracker/reference/alignment triple; the reduce-side instance holds the long-lived
 * machinery (writers, genotyper, read-group sets, running coverage). {@link #mapReduceMerge}
 * combines the two. Also provides read splitting by read group and coverage-based downsampling.
 */
public class ArtificialPoolContext {
    private PrintWriter writerToAuxiliaryFile;   // aux text output (one row per locus)
    private SAMFileWriter writerToSamFile;       // sampled-read BAM output; may be null
    private SingleSampleGenotyper ssg;           // genotyper applied per read group
    private List<Set<String>> readGroupSets;     // read-group IDs per person; index = person
    private long[] runningCoverage;              // cumulative coverage per person
    private RefMetaDataTracker refTracker;       // per-locus state (map side)
    private ReferenceContext refContext;         // per-locus state (map side)
    private AlignmentContext aliContext;         // per-locus state (map side)

    /** Creates an empty context; reduceInit() populates the long-lived fields via setters. */
    public ArtificialPoolContext() {
        readGroupSets = null;
        writerToAuxiliaryFile = null;
        writerToSamFile = null;
        ssg = null;
        refTracker = null;
        aliContext = null;
        refContext = null;
        runningCoverage = null;
    }

    /** Creates a map-side context carrying only the per-locus triple. */
    public ArtificialPoolContext(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        refTracker = tracker;
        refContext = ref;
        aliContext = context;
        readGroupSets = null;
        writerToAuxiliaryFile = null;
        writerToSamFile = null;
        ssg = null;
        runningCoverage = null;
    }

    /** Creates a fully-populated context; used by {@link #mapReduceMerge}. */
    public ArtificialPoolContext(PrintWriter pw, SAMFileWriter sw, SingleSampleGenotyper g, List<Set<String>> rgs, long[] runcvg, RefMetaDataTracker rt, ReferenceContext rc, AlignmentContext ac) {
        writerToAuxiliaryFile = pw;
        writerToSamFile = sw;
        ssg = g;
        readGroupSets = rgs;
        runningCoverage = runcvg;
        refTracker = rt;
        refContext = rc;
        aliContext = ac;
    }

    public void setAuxWriter(PrintWriter writer) {
        writerToAuxiliaryFile = writer;
    }

    public void setSingleSampleGenotyper(SingleSampleGenotyper typer) {
        ssg = typer;
    }

    /** Initializes the wrapped genotyper; call after setSingleSampleGenotyper(). */
    public void initializeSSG() {
        ssg.initialize();
    }

    public void setReadGroupSets(List<Set<String>> rgSets) {
        readGroupSets = rgSets;
    }

    public void setRefMetaDataTracker(RefMetaDataTracker tracker) {
        refTracker = tracker;
    }

    public void setReferenceContext(ReferenceContext ref) {
        refContext = ref;
    }

    public void setAlignmentContext(AlignmentContext context) {
        aliContext = context;
    }

    public void setRunningCoverage(long[] estimate) {
        runningCoverage = estimate;
    }

    public void setSAMFileWriter(SAMFileWriter writer) {
        writerToSamFile = writer;
    }

    /** One person per read-group set. NPEs if readGroupSets has not been set yet. */
    public int getTotalNumberOfPeople() {
        return readGroupSets.size();
    }

    public RefMetaDataTracker getRefMetaDataTracker() {
        return refTracker;
    }

    public ReferenceContext getReferenceContext() {
        return refContext;
    }

    public AlignmentContext getAlignmentContext() {
        return aliContext;
    }

    public PrintWriter getWriterToAuxiliaryFile() {
        return writerToAuxiliaryFile;
    }

    public SingleSampleGenotyper getSingleSampleGenotyper() {
        return ssg;
    }

    public List<Set<String>> getReadGroupSets() {
        return readGroupSets;
    }

    public long[] getRunningCoverage() {
        return runningCoverage;
    }

    public SAMFileWriter getSAMFileWriter() {
        return writerToSamFile;
    }

    /** Reads at the current locus, or null when no alignment context is attached. */
    public List<SAMRecord> getReads() {
        List<SAMRecord> reads;
        if(aliContext == null) {
            reads = null;
        } else {
            reads = aliContext.getReads();
        }
        return reads;
    }

    /** Read offsets at the current locus, or null when no alignment context is attached. */
    public List<Integer> getOffsets() {
        List<Integer> offsets;
        if(aliContext == null) {
            offsets = null;
        } else {
            offsets = aliContext.getOffsets();
        }
        return offsets;
    }

    /**
     * Reads that START at this locus (offset 0), so each read is counted once across loci.
     * Returns null when no alignment context is attached.
     */
    public List<SAMRecord> getNewReads() {
        List<SAMRecord> newReads;
        if(aliContext == null) {
            newReads = null;
        } else {
            newReads = new LinkedList<SAMRecord>();
            List<SAMRecord> allReads = aliContext.getReads();
            List<Integer> allOffsets = aliContext.getOffsets();
            for(int iter = 0; iter < allReads.size(); iter++) {
                if(allOffsets.get(iter) == 0) {
                    newReads.add(allReads.get(iter));
                }
            }
        }
        return newReads;
    }

    /**
     * Partitions reads (and, optionally, their parallel offsets) into one list per person by
     * matching each read's "RG" attribute against the read-group sets. A read matching no set
     * is dropped.
     *
     * @param unsplitReads   reads to partition; may be null
     * @param unsplitOffsets offsets parallel to unsplitReads, or null to skip offset splitting
     * @return pair of per-person arrays; both components null when reads or group sets are null
     */
    public Pair<List<SAMRecord>[],List<Integer>[]> splitByGroup(List<SAMRecord> unsplitReads, List<Integer> unsplitOffsets) {
        List<SAMRecord>[] readsSplitByGroup;
        List<Integer>[] offsetsSplitByGroup;
        if(unsplitReads != null && readGroupSets != null) {
            int nGroups = this.getTotalNumberOfPeople();
            readsSplitByGroup = new ArrayList[nGroups];
            if(unsplitOffsets != null) {
                offsetsSplitByGroup = new ArrayList[nGroups];
            } else {
                offsetsSplitByGroup = null;
            }
            // FIX: the arrays were allocated but their per-group lists never were, so the
            // add() calls below threw a NullPointerException on the first read.
            for(int group = 0; group < nGroups; group++) {
                readsSplitByGroup[group] = new ArrayList<SAMRecord>();
                if(offsetsSplitByGroup != null) {
                    offsetsSplitByGroup[group] = new ArrayList<Integer>();
                }
            }
            int listSize = unsplitReads.size();
            for(int element = 0; element < listSize; element++) {
                SAMRecord read = unsplitReads.get(element);
                for(int groupNumber = 0; groupNumber < nGroups; groupNumber++) {
                    if(readGroupSets.get(groupNumber).contains((String) read.getAttribute("RG"))) {
                        readsSplitByGroup[groupNumber].add(read);
                        if(offsetsSplitByGroup != null) {
                            offsetsSplitByGroup[groupNumber].add(unsplitOffsets.get(element));
                        }
                        break; // a read belongs to exactly one person
                    }
                }
            }
        } else {
            readsSplitByGroup = null;
            offsetsSplitByGroup = null; // compiler complains without these lines
        }
        return new Pair(readsSplitByGroup,offsetsSplitByGroup);
    }

    /** Convenience wrapper around {@link #splitByGroup} when offsets are not needed. */
    public List<SAMRecord>[] splitReadsByGroup(List<SAMRecord> unsplitReads) {
        return (this.splitByGroup(unsplitReads,null)).first;
    }

    // Static methods follow

    /**
     * Combines the reduce side's long-lived state with the map side's per-locus state into a
     * fresh context.
     */
    public static ArtificialPoolContext mapReduceMerge(ArtificialPoolContext mapContext, ArtificialPoolContext reduceContext) {
        return new ArtificialPoolContext(reduceContext.getWriterToAuxiliaryFile(),reduceContext.getSAMFileWriter(),
                reduceContext.getSingleSampleGenotyper(), reduceContext.getReadGroupSets(), reduceContext.getRunningCoverage(),
                mapContext.getRefMetaDataTracker(),mapContext.getReferenceContext(),mapContext.getAlignmentContext());
    }

    /**
     * Independently keeps each read (and its offset) with probability given by its group's
     * sampling rate derived from the global coverage-proportion estimate.
     *
     * @param reads         per-group reads to sample from
     * @param offsets       per-group offsets parallel to reads, or null
     * @param propEstGlobal per-group coverage proportions (see runningCoverageToDouble)
     * @return pair of sampled per-group reads and offsets (offsets component null if not given)
     */
    public static Pair<List<SAMRecord>[],List<Integer>> sampleReadsAndOffsets(List<SAMRecord>[] reads, List<Integer>[] offsets, double[] propEstGlobal) {
        double[] samplingRate = calculateSamplingRateFromGlobalEstimate(propEstGlobal);
        List<SAMRecord>[] sampledReads = new ArrayList[reads.length];
        List<Integer>[] sampledOffsets;
        if(offsets != null){
            sampledOffsets = new ArrayList[offsets.length];
        } else {
            sampledOffsets = null;
        }
        // FIX: the per-group lists were never constructed, so the add() calls below threw a
        // NullPointerException on the first sampled read.
        for(int group = 0; group < reads.length; group++) {
            sampledReads[group] = new ArrayList<SAMRecord>();
            if(sampledOffsets != null) {
                sampledOffsets[group] = new ArrayList<Integer>();
            }
        }
        for(int group = 0; group < reads.length; group++) {
            for(int readNumber = 0; readNumber < reads[group].size(); readNumber++) {
                if(Math.random() < samplingRate[group]) {
                    sampledReads[group].add(reads[group].get(readNumber));
                    if(sampledOffsets != null) {
                        sampledOffsets[group].add(offsets[group].get(readNumber));
                    }
                }
            }
        }
        return new Pair(sampledReads,sampledOffsets);
    }

    /** Formats "genotype<spacer>confidence" for the given person at the current locus. */
    public String genotypeAndConfidenceToString(int group, String spacer) {
        GenotypeCall call = this.getGenotypeCall(group);
        return (call.getGenotypes() + spacer + call.getConfidenceScore().toString());
    }

    /**
     * Runs the single-sample genotyper on only the given person's reads at the current locus.
     * NOTE(review): assumes the alignment context and read-group sets are attached; NPEs
     * otherwise — confirm callers only invoke this on a merged context.
     */
    public GenotypeCall getGenotypeCall(int group) {
        AlignmentContext alicon = this.getAlignmentContext();
        Pair<List<SAMRecord>[],List<Integer>[]> byGroupSplitPair = this.splitByGroup(alicon.getReads(),alicon.getOffsets());
        return ssg.map(this.getRefMetaDataTracker(),this.getReferenceContext(),
                new AlignmentContext(this.getAlignmentContext().getLocation(), byGroupSplitPair.first[group],byGroupSplitPair.second[group]));
    }

    /** Convenience wrapper around {@link #sampleReadsAndOffsets} when offsets are not needed. */
    public static List<SAMRecord>[] sampleReads(List<SAMRecord>[] reads, double[] propEstGlobal) {
        return (sampleReadsAndOffsets(reads, null, propEstGlobal)).first;
    }

    /**
     * Derives per-group sampling rates by dividing each proportion by the minimum proportion.
     *
     * NOTE(review): every rate produced here is >= 1.0, so Math.random() &lt; rate in
     * sampleReadsAndOffsets always keeps every read — no downsampling ever happens. If the
     * intent is to thin over-covered groups down to the least-covered one, the rate should
     * presumably be min/ratios[j] instead. Also, min == 0 yields Infinity rates. Behavior kept
     * as-is pending confirmation.
     */
    public static double[] calculateSamplingRateFromGlobalEstimate(double[] ratios) {
        double min = ratios[0];
        for(double ratio : ratios) {
            if(ratio < min) {
                min = ratio;
            }
        }
        double[] samplingRate = new double[ratios.length];
        // now divide by minimum
        for(int j = 0; j < ratios.length; j++) {
            samplingRate[j] = ratios[j]/min;
        }
        return samplingRate;
    }
}