package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;

import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.executive.TreeReducer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;

import java.io.PrintStream;
import java.util.*;

/**
 * Emits one tab-separated row per covered locus: the locus position, the proportion of
 * the base pileup contributed by each sample (columns fixed in SAM-header sample order),
 * and the total read count at the locus.
 *
 * NOTE(review): the locus is printed in map() while the per-sample values are printed in
 * reduce(); under tree reduction those prints could interleave out of order -- confirm
 * this walker is only ever run with serial reduction.
 */
public class LocusDepthProportionWalker extends LocusWalker<double[], Boolean> implements TreeReducible<Boolean> {

    @Output
    PrintStream out;

    /** Sample id -> output column index; populated exactly once in initialize(). */
    private Map<String, Integer> samOrder;

    /** Writes the header row and fixes the per-sample column ordering. */
    public void initialize() {
        samOrder = new HashMap<String, Integer>(getToolkit().getSAMFileSamples().size());
        int idx = 0;
        out.printf("pos");
        for ( Sample s : getToolkit().getSAMFileSamples() ) {
            // Print the id as data, never as the printf format string itself: a '%' in a
            // sample id would previously have thrown or garbled the header.
            out.printf("\t%s", s.getId());
            samOrder.put(s.getId(), idx++);
        }
        out.printf("\t%s%n", "total");
    }

    public Boolean reduceInit() { return null; }

    /**
     * Counts pileup elements per sample at this locus and normalizes by read count.
     *
     * @return per-sample proportions followed by the total read count, or null for loci
     *         with no reference, no base pileup, or no reads (no row is emitted).
     */
    public double[] map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {

        if ( ref == null || ! context.hasBasePileup() || ! context.hasReads() ) { return null; }

        out.print(ref.getLocus());
        double[] props = new double[1 + samOrder.size()];

        // NOTE(review): the denominator is the context's read count while the numerators
        // below count base-pileup elements; if those can differ the proportions will not
        // sum to 1 -- confirm context.size() is the intended normalizer.
        int nReads = context.size();

        for ( PileupElement e : context.getBasePileup() ) {
            Integer col = samOrder.get(e.getRead().getReadGroup().getSample());
            if ( col == null ) { continue; } // unknown sample: skip rather than NPE on unboxing
            props[col] += 1;
        }

        for ( int idx = 0; idx < props.length - 1; idx++ ) {
            props[idx] /= nReads;
        }

        props[props.length - 1] = nReads;

        return props;
    }

    /** Prints the row computed by map(); the Boolean reduce value itself is unused. */
    public Boolean reduce(double[] map, Boolean pr) {
        if ( map == null ) { return null; }

        // StringBuilder: single-threaded use, no need for StringBuffer's synchronization.
        StringBuilder buf = new StringBuilder();
        for ( double d : map ) {
            buf.append('\t');
            buf.append(String.format("%.4f", d));
        }

        out.printf("%s%n", buf.toString());

        return null;
    }

    public Boolean treeReduce(Boolean a, Boolean b) { return null; }
}
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;

import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;

import java.io.File;
import java.util.List;

/**
 * Argument collection for the read feature associator and related walkers.
 *
 * All fields are public mutable command-line bindings (the @Argument framework populates
 * them by reflection), so their names are external API and must not be renamed.
 */
public class RFAArgumentCollection {

    // Simple names of ReadFeatureAggregator subclasses to test; null means "test all
    // available feature classes" (see RFAWalker.getFeatureAggregators).
    @Argument(doc="ReadFeatures you want to test. None specified = all will be tested.",required=false,shortName="f",fullName="Feature")
    public List<String> inputFeatures = null;

    // Width in bp of each tested window.
    @Argument(doc="Size of window on which to perform tests",required=false,fullName="windowSize")
    public int windowSize = 50;

    // Stride between window starts; RFAWalker.initialize() requires windowSize % windowJump == 0.
    @Argument(doc="Size of the jump between tested windows",required=false,fullName="windowJump")
    public int windowJump = 10;

    // One sample id per line (read via XReadLines); required together with controlFile.
    @Argument(doc="File containing a list of case samples",required=false,shortName="case",fullName="case")
    public File caseFile = null;

    // One sample id per line (read via XReadLines); required together with caseFile.
    @Argument(doc="File containing a list of control samples",required=false,shortName="control",fullName="control")
    public File controlFile = null;

    // Z-score threshold for the fixed-significance case/control mean-shift test.
    @Argument(doc="Fixed significance level, as a Z-score",required=false,shortName="z",fullName="fixedZ")
    public double fixedZ = 6.0;

    @Argument(doc="Insert size below which to flag a read as aberrant",required=false,shortName="LIS",fullName="LowInsertSize")
    public int lowInsertSize = 50;

    @Argument(doc="Insert size above which to flag a read as aberrant",required=false,shortName="HIS",fullName="HighInsertSize")
    public int highInsertSize = 450;

    // Per-sample Z threshold used in RFAWalker.getAffectedSamples().
    @Argument(doc="Significance level for determining whether a sample contains a significant proportion of affected reads",required=false,shortName="sz",fullName="perSampleZ")
    public double sampleZThresh = 3.0;

    // NOTE(review): UPPER_SNAKE_CASE normally signals a constant, but this is a mutable
    // argument field referenced elsewhere as collection.EPSILON -- renaming would break callers.
    @Argument(doc="Lower bound for significant proportion of affected reads",shortName="se",fullName="sampleEpsilon",required=false)
    public double EPSILON = 0.05;

    @Argument(doc="Number of clipped bases for a read to be considered \"split\"",required=false,shortName="cb",fullName="clippedBases")
    public short clippedBases = 4;

    /* todo -- these for a possible "split read" binarification of clipped bases -- if there's a fast way to traverse clipped bases
    @Argument(doc="Minimum base quality for a clipped base to be indicative of a split read",required=false,shortName="CLBQ",fullName="CLBQ")
    public short clbq = 10;

    @Argument(doc="Minimum base quality sum for clipped bases (as a string) to be indicative of a split read",required=false,shortName="CLBS",fullName="CLBS")
    public short clbs=50;
    */
}
    // NOTE(review): this span belongs to RFAArgumentCollection.java~ -- a stale editor
    // backup accidentally committed alongside the real RFAArgumentCollection.java (the
    // only difference is clippedBases: 5 here vs 4 in the live file). Editor backup
    // files should not be under version control; recommend deleting the ~ file entirely.

    @Argument(doc="ReadFeatures you want to test. None specified = all will be tested.",required=false,shortName="f",fullName="Feature")
    public List<String> inputFeatures = null;

    @Argument(doc="Size of window on which to perform tests",required=false,fullName="windowSize")
    public int windowSize = 50;

    @Argument(doc="Size of the jump between tested windows",required=false,fullName="windowJump")
    public int windowJump = 10;

    @Argument(doc="File containing a list of case samples",required=false,shortName="case",fullName="case")
    public File caseFile = null;

    @Argument(doc="File containing a list of control samples",required=false,shortName="control",fullName="control")
    public File controlFile = null;

    @Argument(doc="Fixed significance level, as a Z-score",required=false,shortName="z",fullName="fixedZ")
    public double fixedZ = 6.0;

    @Argument(doc="Insert size below which to flag a read as aberrant",required=false,shortName="LIS",fullName="LowInsertSize")
    public int lowInsertSize = 50;

    @Argument(doc="Insert size above which to flag a read as aberrant",required=false,shortName="HIS",fullName="HighInsertSize")
    public int highInsertSize = 450;

    @Argument(doc="Significance level for determining whether a sample contains a significant proportion of affected reads",required=false,shortName="sz",fullName="perSampleZ")
    public double sampleZThresh = 3.0;

    @Argument(doc="Lower bound for significant proportion of affected reads",shortName="se",fullName="sampleEpsilon",required=false)
    public double EPSILON = 0.05;

    @Argument(doc="Number of clipped bases for a read to be considered \"split\"",required=false,shortName="cb",fullName="clippedBases")
    public short clippedBases = 5;

    /* todo -- these for a possible "split read" binarification of clipped bases -- if there's a fast way to traverse clipped bases
    @Argument(doc="Minimum base quality for a clipped base to be indicative of a split read",required=false,shortName="CLBQ",fullName="CLBQ")
    public short clbq = 10;
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;

import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;

import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;

/**
 * Read feature association walker -- associates read features between dichotomized, or multi-group cohorts.
 * Windows of reads are accumulated in an RFWindow; completed windows are tested for a
 * case/control shift in each read feature and emitted as bedTable rows.
 *
 * todo -- need a heuristic to stop doing tests where there is certainly no signal
 * todo -- for most features there's a nuisance variable which is the proportion of *paired* reads, perhaps a pair-only setting for read features
 */
@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class,NotPrimaryAlignmentReadFilter.class,UnmappedReadFilter.class})
public class RFAWalker extends ReadWalker<SAMRecord, RFWindow> {

    @ArgumentCollection
    private RFAArgumentCollection collection = new RFAArgumentCollection();

    @Output
    public PrintStream out;

    /** Sample id -> true for case, false for control; populated from the -case/-control files. */
    Map<String, Boolean> caseStatus;

    /** Prototype aggregators, sorted by class simple name; no re-instantiation, list preserves ordering. */
    protected List<ReadFeatureAggregator> aggregators;

    protected Iterator<GenomeLoc> locusIterator;
    protected GenomeLoc iteratorLoc;   // current user interval; null once all intervals are consumed
    protected GenomeLoc loc;           // location of the read currently being reduced
    protected String sample;

    // Immutable shared empty list for the "no affected samples" fast path.
    private final List<String> EMPTY_LIST = Collections.emptyList();

    /**
     * Validates arguments, loads case/control status, instantiates one aggregator per
     * requested feature class (sorted for a stable column order), writes the header,
     * and primes the interval iterator.
     *
     * @throws UserException on bad window arguments, missing cohort files, an unreadable
     *         cohort file, or a sample with no case/control assignment.
     */
    public void initialize() {
        if ( collection.windowSize % collection.windowJump != 0 ) {
            throw new UserException("Window size is not divisible by window jump.");
        }

        if ( collection.caseFile == null || collection.controlFile == null ) {
            throw new UserException("You must provide both a case file (-case) and a control file (-control) each listing those samples belonging to the cohort");
        }

        caseStatus = new HashMap<String, Boolean>(getToolkit().getSAMFileSamples().size());
        try {
            for ( String sample : new XReadLines(collection.caseFile) ) {
                caseStatus.put(sample, true);
            }
            for ( String sample : new XReadLines(collection.controlFile) ) {
                caseStatus.put(sample, false);
            }

            // Every sample in the SAM headers must have an assignment.
            for ( Sample sample : getToolkit().getSAMFileSamples() ) {
                if ( ! caseStatus.containsKey(sample.getId()) ) {
                    throw new UserException("No case/control status for sample " + sample.getId());
                }
            }
        } catch ( FileNotFoundException e ) {
            throw new UserException("Unable to open a case/control file", e);
        }

        Set<Class<? extends ReadFeatureAggregator>> aggregatorSet = getFeatureAggregators(collection.inputFeatures);
        Set<ReadFeatureAggregator> rfHolder1 = new HashSet<ReadFeatureAggregator>(aggregatorSet.size());
        try {
            for ( Class<? extends ReadFeatureAggregator> featureClass : aggregatorSet ) {
                ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(collection);
                rfHolder1.add(readFeature);
            }
        } catch ( Exception e ) {
            throw new StingException("A read feature instantiation error occurred during initialization", e);
        }

        // Sort by simple class name so output columns have a deterministic order.
        ReadFeatureAggregator[] rfHolder2 = rfHolder1.toArray(new ReadFeatureAggregator[rfHolder1.size()]);
        Arrays.sort(rfHolder2, new Comparator<ReadFeatureAggregator>() {
            @Override
            public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) {
                return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName());
            }
        });
        aggregators = Arrays.asList(rfHolder2);

        writeHeader();

        locusIterator = getToolkit().getIntervals().iterator();
        iteratorLoc = locusIterator.hasNext() ? locusIterator.next() : null;
    }

    public RFWindow reduceInit() {
        // (review) a set of sample ids used to be built here and silently discarded; removed as dead code.
        return new RFWindow(aggregators, collection, caseStatus, getToolkit().getGenomeLocParser());
    }

    /**
     * Records the read's sample and effective location (clamped to the start of the
     * current interval) for use by reduce(); passes the read through unchanged.
     */
    public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
        if ( read == null || ref == null ) { return null; } // unmapped reads have null ref contexts
        // Can be problematic if read aligns prior to start of contig -- should never happen.
        GenomeLoc newLoc = ref.getLocus().getStartLocation();
        // Guard: iteratorLoc is null when no intervals remain (previously NPE'd here).
        if ( iteratorLoc == null || newLoc.isPast(iteratorLoc.getStartLocation()) ) {
            loc = newLoc;
        } else {
            loc = iteratorLoc.getStartLocation();
        }
        sample = read.getReadGroup().getSample();
        return read;
    }

    /**
     * Feeds the read into the window accumulator; when the read has moved past the end
     * of the current user interval, flushes that interval and advances to the next one
     * before re-processing the read.
     */
    public RFWindow reduce(SAMRecord read, RFWindow prevReduce) {
        if ( iteratorLoc != null && iteratorLoc.isBefore(loc) ) { // read is past end of the user interval
            onIntervalDone(prevReduce);
            iteratorLoc = locusIterator.hasNext() ? locusIterator.next() : null;
            // Guard: previously NPE'd when the interval iterator was exhausted.
            if ( iteratorLoc != null && loc.startsBefore(iteratorLoc) ) {
                loc = iteratorLoc.getStartLocation();
            }
            reduce(read, prevReduce); // re-dispatch the read against the new interval
        } else if ( read != null ) {
            // todo -- what happens if first read of an interval is not before or at the start of the interval?
            List<Pair<GenomeLoc, Map<String, List<ReadFeatureAggregator>>>> completed = prevReduce.inc(read, loc, sample, iteratorLoc);
            // Each completed (possibly several) window gets tested and emitted in bed format.
            for ( Pair<GenomeLoc, Map<String, List<ReadFeatureAggregator>>> samWindow : completed ) {
                runWindowTests(samWindow.second, samWindow.first);
            }
        }
        return prevReduce;
    }

    /** Flushes and tests all windows still pending when a user interval ends. */
    public RFWindow onIntervalDone(RFWindow rWindow) {
        List<Pair<GenomeLoc, Map<String, List<ReadFeatureAggregator>>>> completed = rWindow.flush(iteratorLoc);
        for ( Pair<GenomeLoc, Map<String, List<ReadFeatureAggregator>>> samWindow : completed ) {
            runWindowTests(samWindow.second, samWindow.first);
        }
        return rWindow;
    }

    /**
     * Resolves the requested feature names to aggregator classes discovered via the
     * plugin manager; a null request list means "all available features".
     *
     * @throws UserException if a requested name matches no discovered class.
     */
    public Set<Class<? extends ReadFeatureAggregator>> getFeatureAggregators(List<String> requestedFeatures) {
        HashSet<Class<? extends ReadFeatureAggregator>> newFeatureSet = new HashSet<Class<? extends ReadFeatureAggregator>>();
        List<Class<? extends ReadFeatureAggregator>> availableFeatures =
                new PluginManager<ReadFeatureAggregator>(ReadFeatureAggregator.class).getPlugins();

        if ( collection.inputFeatures == null ) {
            newFeatureSet.addAll(availableFeatures);
            return newFeatureSet;
        }

        Map<String, Class<? extends ReadFeatureAggregator>> classNameToClass =
                new HashMap<String, Class<? extends ReadFeatureAggregator>>(collection.inputFeatures.size());
        for ( Class<? extends ReadFeatureAggregator> clazz : availableFeatures ) {
            classNameToClass.put(clazz.getSimpleName(), clazz);
        }

        for ( String s : requestedFeatures ) {
            if ( classNameToClass.containsKey(s) ) {
                newFeatureSet.add(classNameToClass.get(s));
            } else {
                throw new UserException("The name " + s + " does not correspond to an available read feature class.");
            }
        }

        return newFeatureSet;
    }

    /**
     * Emits one bed row for the window: for each feature, the fixed-significance delta,
     * affected case:control counts, a binomial p-value on that split, and the sample lists.
     * todo -- really the aggregators should be iterated over directly (rather than indirectly through the index)
     */
    public void runWindowTests(Map<String, List<ReadFeatureAggregator>> window, GenomeLoc loc) {
        out.printf("%s\t%d\t%d", loc.getContig(), loc.getStart(), loc.getStop());
        for ( int agIdx = 0; agIdx < aggregators.size(); agIdx++ ) {
            double fixedDelta = fixedSignificance(window.get("case").get(agIdx), window.get("control").get(agIdx));
            Pair<List<String>, List<String>> caseControlAffected = getAffectedSamples(window, agIdx, fixedDelta);
            List<String> cases = caseControlAffected.getFirst();
            List<String> controls = caseControlAffected.getSecond();
            out.printf("\t%.2e\t%d:%d\t%.2e\t%s;%s", fixedDelta, cases.size(), controls.size(),
                    MathUtils.binomialProbability(cases.size(), cases.size() + controls.size(), 0.5),
                    Utils.join(",", cases), Utils.join(",", controls));
        }
        out.printf("%n");
    }

    /**
     * Partitions samples whose per-sample Z statistic exceeds the threshold into case
     * and control lists; short-circuits to empty lists when the window showed no signal.
     */
    public Pair<List<String>, List<String>> getAffectedSamples(Map<String, List<ReadFeatureAggregator>> aggregators, int idx, double fixedDelta) {
        if ( fixedDelta == 0.0 ) { return new Pair<List<String>, List<String>>(EMPTY_LIST, EMPTY_LIST); } // todo -- too hacky

        Pair<List<String>, List<String>> ccSampleList =
                new Pair<List<String>, List<String>>(new ArrayList<String>(), new ArrayList<String>());
        for ( Map.Entry<String, List<ReadFeatureAggregator>> entry : aggregators.entrySet() ) {
            if ( entry.getKey().equals("case") || entry.getKey().equals("control") ) { continue; } // pooled entries, not samples
            ReadFeatureAggregator aggregator = entry.getValue().get(idx);
            // Is this sample sending a truly significant signal?
            // NOTE(review): zero variance yields NaN/Infinity here (NaN comparisons are false) -- confirm intended.
            double zs = (aggregator.getMean() - collection.EPSILON) * Math.sqrt(aggregator.getnReads()) / Math.sqrt(aggregator.getUnbiasedVar());
            if ( zs > collection.sampleZThresh ) {
                if ( caseStatus.get(entry.getKey()) ) {
                    ccSampleList.first.add(entry.getKey());
                } else {
                    ccSampleList.second.add(entry.getKey());
                }
            }
        }

        return ccSampleList;
    }

    /**
     * Two-sample Z test on the pooled case vs control aggregators.
     *
     * @return the raw mean difference when |Z| >= fixedZ, 0.0 otherwise (including
     *         when either cohort has no reads).
     */
    public double fixedSignificance(ReadFeatureAggregator caseAg, ReadFeatureAggregator controlAg) {
        if ( caseAg.getnReads() == 0 || controlAg.getnReads() == 0 ) {
            return 0.0;
        }
        double stat_num = caseAg.getMean() - controlAg.getMean();
        double stat_denom = Math.sqrt(caseAg.getUnbiasedVar() / caseAg.getnReads() + controlAg.getUnbiasedVar() / controlAg.getnReads());
        double stat = stat_num / stat_denom;
        if ( stat * stat < collection.fixedZ * collection.fixedZ ) {
            return 0.0;
        } else {
            return stat_num;
        }
    }

    /**
     * Weighted "confidence interval sum" contrast: each sample whose feature mean is
     * significantly shifted from a threshold contributes precision-weighted mass to its
     * cohort's Gaussian; returns the gap between the cohorts' fixedZ/4 interval edges.
     * (This comment serves as an explicit normality assumption, e.g. that the DF will be large.)
     */
    public double confidenceIntervalSum(Map<String, List<ReadFeatureAggregator>> window, int offset) {
        double caseWeightMean = 0.0;
        double caseWeightVar = 0.0;
        double caseSumWeight = 0.0;
        double controlWeightMean = 0.0;
        double controlWeightVar = 0.0;
        double controlSumWeight = 0.0;
        for ( Map.Entry<String, List<ReadFeatureAggregator>> sampleEntry : window.entrySet() ) {
            if ( ! sampleEntry.getKey().equals("case") && ! sampleEntry.getKey().equals("control") ) {
                // First check if the sample is shifted from zero (CI does not include zero).
                // todo -- fixme. This will always be true for insert sizes, clipped reads, mapping quality...should have an avg value

                ReadFeatureAggregator aggregator = sampleEntry.getValue().get(offset);
                if ( aggregator.getnReads() == 0 ) {
                    continue;
                }

                boolean shifted;
                int shiftThresh = 0; /*
                // todo -- per-feature thresholds; disabled pending a cleaner design
                if ( aggregator instanceof InsertSize ) {
                    shiftThresh = 150;
                }
                if ( aggregator instanceof ClippedBases ) {
                    shiftThresh = 6;
                } */
                if ( aggregator.getMean() < shiftThresh ) {
                    // use fixedZ/4 to be a little bit less strenuous -- todo -- make an input?
                    shifted = aggregator.getMean() + collection.fixedZ / 4 * Math.sqrt(aggregator.getUnbiasedVar()) / aggregator.getnReads() < shiftThresh;
                } else {
                    shifted = aggregator.getMean() - collection.fixedZ / 4 * Math.sqrt(aggregator.getUnbiasedVar()) / aggregator.getnReads() > shiftThresh;
                }
                if ( shifted ) {
                    double twoS2 = 2 * aggregator.getUnbiasedVar() * aggregator.getUnbiasedVar();
                    if ( caseStatus.get(sampleEntry.getKey()) ) {
                        caseWeightMean += (aggregator.getnReads() - 1.0) * aggregator.getMean() / (twoS2);
                        caseWeightVar += aggregator.getUnbiasedVar() * Math.pow((aggregator.getnReads() - 1.0) / (twoS2), 2);
                        caseSumWeight += (aggregator.getnReads() - 1.0) / (twoS2);
                    } else {
                        controlWeightMean += (aggregator.getnReads() - 1.0) * aggregator.getMean() / (twoS2);
                        controlWeightVar += aggregator.getUnbiasedVar() * Math.pow((aggregator.getnReads() - 1.0) / (twoS2), 2);
                        controlSumWeight += (aggregator.getnReads() - 1.0) / (twoS2);
                    }
                }
            }
        }

        double caseGaussianMean = caseWeightMean / caseSumWeight;
        double controlGaussianMean = controlWeightMean / controlSumWeight;
        double caseGaussianVar = caseWeightVar / (caseSumWeight * caseSumWeight);
        double controlGaussianVar = controlWeightVar / (controlSumWeight * controlSumWeight);
        // todo -- is the z-factor an appropriate statistic?
        if ( caseGaussianMean > controlGaussianMean ) {
            // want to examine the case lower fixedZ*stdev vs the control upper fixedZ*stdev
            return (caseGaussianMean - collection.fixedZ / 4 * Math.sqrt(caseGaussianVar)) - (controlGaussianMean + collection.fixedZ / 4 * Math.sqrt(controlGaussianVar));
        } else {
            // want to examine the case upper fixedZ*stdev vs the control lower fixedZ*stdev
            return (controlGaussianMean - collection.fixedZ / 4 * Math.sqrt(controlGaussianVar)) - (caseGaussianMean + collection.fixedZ / 4 * Math.sqrt(caseGaussianVar));
        }
    }

    /** Writes the bedTable track header: chr,start,stop then four columns per feature. */
    public void writeHeader() {
        // Per-feature columns: -d delta, -r affected ratio, -s binomial significance, -p sample lists.
        StringBuilder buf = new StringBuilder();
        buf.append("description=chr,start,stop");
        for ( ReadFeatureAggregator f : aggregators ) {
            buf.append(",");
            buf.append(f.getClass().getSimpleName());
            buf.append("-d,");
            buf.append(f.getClass().getSimpleName());
            buf.append("-r,");
            buf.append(f.getClass().getSimpleName());
            buf.append("-s,");
            buf.append(f.getClass().getSimpleName());
            buf.append("-p");
        }
        out.printf("track type=bedTable %s%n", buf.toString());
    }
}
// NOTE(review): this is RFAWalker.java~ -- a stale editor backup of an OLDER revision of
// RFAWalker.java (fewer read filters, ReadFeatureWindow instead of RFWindow, no
// interval-aware windowing, hard-coded window arithmetic in reduce()). Editor backup
// files should not be committed; recommend deleting this file from version control
// rather than maintaining it. Code below is reproduced as-is, comments only added.
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;

import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;

import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;

/**
 * Read feature association walker -- associates read features between dichotomized, or multi-group cohorts
 * todo -- need a heuristic to stop doing tests where there is certainly no signal
 * todo -- for most features there's a nuisance variable which is the proportion of *paired* reads, perhaps a pair-only setting for read features
 */
@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class})
public class RFAWalker extends ReadWalker<SAMRecord, ReadFeatureWindow> {
    // todo -- this needs to be an argument collection that can get passed around to initialize read features etc
    @ArgumentCollection
    private RFAArgumentCollection collection = new RFAArgumentCollection();

    @Output
    public PrintStream out;

    // Sample id -> true for case, false for control.
    Map<String,Boolean> caseStatus;

    protected List<ReadFeatureAggregator> aggregators; // no re-instantiation, use a list to ensure ordering

    protected GenomeLoc loc;
    protected String sample;

    // Validates window arguments, loads cohort files, instantiates and sorts aggregators, writes header.
    public void initialize() {
        if ( collection.windowSize % collection.windowJump != 0 ) {
            throw new UserException("Window size is not divisible by window jump.");
        }

        if ( collection.caseFile == null || collection.controlFile == null ) {
            throw new UserException("You must provide both a case file (-case) and a control file (-control) each listing those samples belonging to the cohort");
        }

        caseStatus = new HashMap<String,Boolean>(getToolkit().getSAMFileSamples().size());
        try {
            for ( String sample : new XReadLines(collection.caseFile) ) {
                caseStatus.put(sample,true);
            }
            for ( String sample : new XReadLines(collection.controlFile)) {
                caseStatus.put(sample,false);
            }

            for ( Sample sample : getToolkit().getSAMFileSamples() ) {
                if ( ! caseStatus.containsKey(sample.getId())) {
                    throw new UserException("No case/control status for sample "+sample.getId());
                }
            }

        } catch ( FileNotFoundException e ) {
            throw new UserException("Unable to open a case/control file",e);
        }

        Set<Class<? extends ReadFeatureAggregator>> aggregatorSet = getFeatureAggregators(collection.inputFeatures);
        Set<ReadFeatureAggregator> rfHolder1 = new HashSet<ReadFeatureAggregator>(aggregatorSet.size());
        try {
            for ( Class<? extends ReadFeatureAggregator> featureClass : aggregatorSet ) {
                ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(collection);
                rfHolder1.add(readFeature);
            }
        } catch ( Exception e ) {
            throw new StingException("A read feature instantiation error occurred during initialization",e);
        }

        // Sort by simple class name for a deterministic column order.
        ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()];
        int idx = 0;
        for ( ReadFeatureAggregator f : rfHolder1 ) {
            rfHolder2[idx++] = f;
        }
        Arrays.sort(rfHolder2, new Comparator<ReadFeatureAggregator>() {
            @Override
            public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) {
                return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName());
            }
        });
        aggregators = Arrays.asList(rfHolder2);

        writeHeader();

    }

    public ReadFeatureWindow reduceInit() {
        // NOTE(review): 'samples' is built and never used (dead code, also present in the live file's older revision).
        Set<String> samples = new HashSet<String>(getToolkit().getSamples().size());
        for ( Sample s : getToolkit().getSamples() ) {
            samples.add(s.getId());
        }
        return new ReadFeatureWindow(aggregators,caseStatus,collection);
    }

    // Records read location and sample for reduce(); passes the read through.
    public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
        if ( ref == null ) { return null; } // unmapped reads have null ref contexts
        loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart());
        sample = read.getReadGroup().getSample();
        return read;
    }

    public ReadFeatureWindow reduce(SAMRecord read, ReadFeatureWindow prevReduce) {
        if ( read != null ) {
            List<Map<String,List<ReadFeatureAggregator>>> completed = prevReduce.inc(read,loc,sample);
            // todo -- run tests here; for now just log that a window/multiple windows are complete
            if ( completed.size() > 0 ) {
                // bed format
                int locShift = 0;
                for ( Map<String,List<ReadFeatureAggregator>> samWindow : completed ) {
                    // NOTE(review): window bounds are hard-coded arithmetic (start-52+shift) in this old revision.
                    GenomeLoc window = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(),loc.getStart()-52+locShift,loc.getStop()-1);
                    runWindowTests(samWindow, window);
                    locShift += collection.windowJump;
                }
            }
        }
        return prevReduce;
    }

    // Resolves requested feature names to plugin classes; null request = all available.
    public Set<Class<? extends ReadFeatureAggregator>> getFeatureAggregators(List<String> requestedFeatures) {
        HashSet<Class<? extends ReadFeatureAggregator>> newFeatureSet = new HashSet<Class<? extends ReadFeatureAggregator>>();
        List<Class<? extends ReadFeatureAggregator>> availableFeatures = new PluginManager<ReadFeatureAggregator>(ReadFeatureAggregator.class).getPlugins();

        if ( collection.inputFeatures == null ) {
            newFeatureSet.addAll(availableFeatures);
            return newFeatureSet;
        }


        Map<String,Class<? extends ReadFeatureAggregator>> classNameToClass = new HashMap<String,Class<? extends ReadFeatureAggregator>>(collection.inputFeatures.size());
        for ( Class<? extends ReadFeatureAggregator> clazz : availableFeatures ) {
            classNameToClass.put(clazz.getSimpleName(),clazz);
        }

        for ( String s : requestedFeatures) {
            if ( classNameToClass.containsKey(s) ) {
                newFeatureSet.add(classNameToClass.get(s));
            } else {
                throw new UserException("The name "+s+" does not correspond to an available read feature class.");
            }
        }

        return newFeatureSet;
    }

    // Emits one bed row: fixed-significance delta and weighted CI delta per feature.
    public void runWindowTests(Map<String,List<ReadFeatureAggregator>> window, GenomeLoc loc) {
        // two main tests: fixed-significance shift, and confidence-interval sum
        out.printf("%s\t%d\t%d",loc.getContig(),loc.getStart(),loc.getStop());
        for ( int agIdx = 0; agIdx < aggregators.size(); agIdx ++ ) {
            double fixedDelta = fixedSignificance(window.get("case").get(agIdx),window.get("control").get(agIdx));
            double weightedDelta = confidenceIntervalSum(window,agIdx);
            out.printf("\t%.2e\t%.2e",Double.isNaN(fixedDelta) ? 0.0 : fixedDelta,weightedDelta);
        }
        out.printf("%n");
    }

    // Two-sample Z test; returns the Z statistic when |Z| >= fixedZ, else 0.0.
    // (The live RFAWalker.java returns the mean difference here instead -- behavior diverged.)
    public double fixedSignificance(ReadFeatureAggregator caseAg, ReadFeatureAggregator controlAg) {
        if ( caseAg.getnReads() == 0 || controlAg.getnReads() == 0 ) {
            return 0.0;
        }
        double stat_num = caseAg.getMean() - controlAg.getMean();
        double stat_denom = Math.sqrt(caseAg.getUnbiasedVar()/caseAg.getnReads() + controlAg.getUnbiasedVar()/controlAg.getnReads());
        double stat = stat_num/stat_denom;
        if ( stat*stat < collection.fixedZ*collection.fixedZ ) {
            return 0.0;
        } else {
            return stat;
        }
    }

    // Precision-weighted cohort contrast; this comment serves as an explicit normality
    // assumption (e.g. that the DF will be large).
    public double confidenceIntervalSum(Map<String,List<ReadFeatureAggregator>> window, int offset) {
        double caseWeightMean = 0.0;
        double caseWeightVar = 0.0;
        double caseSumWeight = 0.0;
        double controlWeightMean = 0.0;
        double controlWeightVar = 0.0;
        double controlSumWeight = 0.0;
        for ( Map.Entry<String,List<ReadFeatureAggregator>> sampleEntry : window.entrySet() ) {
            if ( ! sampleEntry.getKey().equals("case") && ! sampleEntry.getKey().equals("control") ) {
                // first check if the sample is shifted from zero (CI does not include zero)
                // todo -- fixme. This will always be true for insert sizes, clipped reads, mapping quality...should have an avg value

                ReadFeatureAggregator aggregator = sampleEntry.getValue().get(offset);
                if ( aggregator.getnReads() == 0 ) {
                    continue;
                }

                boolean shifted;
                int shiftThresh = 0;
                // todo -- per-feature thresholds hard-coded below; needs a cleaner design
                if ( aggregator instanceof InsertSize ) {
                    shiftThresh = 150;
                }
                if ( aggregator instanceof ClippedBases ) {
                    shiftThresh = 6;
                }
                if ( aggregator.getMean() < shiftThresh ) {
                    // use fixedZ/4 to be a little bit less strenuous -- todo -- make an input?
                    shifted = aggregator.getMean() + collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() < shiftThresh;
                } else {
                    shifted = aggregator.getMean() - collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() > shiftThresh;;
                }
                if ( shifted ) {
                    double twoS2 = 2*aggregator.getUnbiasedVar()*aggregator.getUnbiasedVar();
                    if ( caseStatus.get(sampleEntry.getKey())) {
                        caseWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2);
                        caseWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2);
                        caseSumWeight += (aggregator.getnReads()-1.0)/(twoS2);
                    } else {
                        controlWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2);
                        controlWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2);
                        controlSumWeight += (aggregator.getnReads()-1.0)/(twoS2);
                    }
                }
            }
        }

        double caseGaussianMean = caseWeightMean/caseSumWeight;
        double controlGaussianMean = controlWeightMean/controlSumWeight;
        double caseGaussianVar = caseWeightVar/(caseSumWeight*caseSumWeight);
        double controlGaussianVar = controlWeightVar/(controlSumWeight*controlSumWeight);
        // todo -- is the z-factor an appropriate statistic?
        if ( caseGaussianMean > controlGaussianMean ) {
            // want to examine the case lower fixedZ*stdev vs the control upper fixedZ*stev
            return (caseGaussianMean-collection.fixedZ/4*Math.sqrt(caseGaussianVar)) - (controlGaussianMean + collection.fixedZ/4*Math.sqrt(controlGaussianVar));
        } else {
            // want to examine the case upper fixedZ*stev vs the control lower fixedZ*stev
            return (controlGaussianMean-collection.fixedZ/4*Math.sqrt(controlGaussianVar)) - ( caseGaussianMean + collection.fixedZ/4*Math.sqrt(caseGaussianVar));
        }
    }

    // Writes the bedTable track header: chr,start,stop then two columns per feature (-d, -c).
    public void writeHeader() {
        StringBuffer buf = new StringBuffer();
        buf.append("description=chr,start,stop");
        for ( ReadFeatureAggregator f : aggregators ) {
            buf.append(",");
            buf.append(f.getClass().getSimpleName());
            buf.append("-d,");
            buf.append(f.getClass().getSimpleName());
            buf.append("-c");
        }
        out.printf("track type=bedTable %s%n",buf);
    }
}
org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/23/11 + * Time: 5:22 PM + * To change this template use File | Settings | File Templates. + */ +public class RFCombineWalker extends RodWalker { + + private static final String FIRST_COL = "chrm:start-stop"; + + @Output + PrintStream out; + + private List order; + + private GenomeLoc prevLoc; + + public void initialize() { + order = new ArrayList(getToolkit().getRodDataSources().size()); + StringBuffer header = new StringBuffer(); + header.append(FIRST_COL); + for ( ReferenceOrderedDataSource rSource : getToolkit().getRodDataSources() ) { + if ( rSource.getRecordType().isAssignableFrom(TableFeature.class) ) { + //System.out.println(rSource.getHeader().toString()); + for ( String entry : (Collection) rSource.getHeader() ) { + if ( ! entry.startsWith("HEADER") ) { + header.append("\t"); + header.append(entry); + } + } + order.add(rSource.getName()); + } + } + + out.printf("%s%n",header); + + prevLoc = null; + } + + public Object reduceInit() { return null; } + + public Object map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null || ref == null ) { return null;} + + GenomeLoc loc = null; + boolean needPrint = false; + List eventBySample = new ArrayList(); + + for ( String rodName : order ) { + List namedMD = tracker.getReferenceMetaData(rodName,true); + TableFeature feature = null; + if ( namedMD.size() > 0 ) { + feature = namedMD.get(0) instanceof TableFeature ? 
(TableFeature) namedMD.get(0) : null; + } + + if ( feature == null ) { throw new StingException("This should be an instance of TableFeature, no?"); } + + loc = feature.getLocation(); + if ( prevLoc != null && loc.equals(prevLoc) ) { + break; + } + + for ( String s : feature.getAllValues().subList(1,feature.getAllValues().size()) ) { + boolean has = ! (s.charAt(0) == '0'); + eventBySample.add(s); + needPrint |= has; + } + } + + if ( needPrint && (loc != null)) { + out.printf("%s",loc.toString()); + for ( String s : eventBySample ) { + out.printf("\t%s", s); + } + out.printf("%n"); + } + + prevLoc = loc; + + return null; + + } + + public Object reduce(Object map, Object reduce) { return null; } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFDumperWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFDumperWalker.java new file mode 100755 index 000000000..c146311cd --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFDumperWalker.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.FileNotFoundException; +import 
java.io.PrintStream; +import java.util.*; + +/** + * A (currently very lame) utility to dump read feature information to a file for validity checking of read feature extractors. + * Soon to be extended for aggregator and window validation; as well as postspective signal analysis. + */ +public class RFDumperWalker extends ReadWalker { + @ArgumentCollection + private RFAArgumentCollection collection = new RFAArgumentCollection(); + + List aggregators; + GenomeLoc loc; + boolean paired; + String sample; + String name; + + String[] data; + + @Output + PrintStream out; + + public void initialize() { + + Set> aggregatorSet = getFeatureAggregators(collection.inputFeatures); + Set rfHolder1 = new HashSet(aggregatorSet.size()); + try { + for ( Class featureClass : aggregatorSet ) { + ReadFeatureAggregator readFeature = featureClass.getConstructor(collection.getClass()).newInstance(collection); + rfHolder1.add(readFeature); + } + } catch ( Exception e ) { + throw new StingException("A read feature instantiation error occurred during initialization",e); + } + + ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()]; + int idx = 0; + for ( ReadFeatureAggregator f : rfHolder1 ) { + rfHolder2[idx++] = f; + } + Arrays.sort(rfHolder2, new Comparator() { + @Override + public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) { + return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName()); + } + }); + aggregators = Arrays.asList(rfHolder2); + for ( ReadFeatureAggregator ag : aggregators ) { + logger.info(ag.getClass().getSimpleName()); + } + + data = new String[aggregators.size()]; + } + + public Set> getFeatureAggregators(List requestedFeatures) { + HashSet> newFeatureSet = new HashSet>(); + List> availableFeatures = new PluginManager(ReadFeatureAggregator.class).getPlugins(); + + if ( collection.inputFeatures == null ) { + newFeatureSet.addAll(availableFeatures); + return newFeatureSet; + } + + + Map> classNameToClass = new 
HashMap>(collection.inputFeatures.size()); + for ( Class clazz : availableFeatures ) { + classNameToClass.put(clazz.getSimpleName(),clazz); + } + + for ( String s : requestedFeatures) { + if ( classNameToClass.containsKey(s) ) { + newFeatureSet.add(classNameToClass.get(s)); + } else { + throw new UserException("The name "+s+" does not correspond to an available read feature class."); + } + } + + return newFeatureSet; + } + + public ReadFeatureWindow reduceInit() { return null; } + + public String[] map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + // TODO: THIS WILL BREAK IF FEATURE REQUIRES PAIRED READ + if ( ref == null ) { return null; } // unmapped reads have null ref contexts + loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart()); + sample = read.getReadGroup().getSample(); + name = read.getReadName(); + paired = read.getReadPairedFlag() && ! read.getMateUnmappedFlag(); + int idx = 0; + for ( ReadFeatureAggregator aggregator : aggregators) { + data[idx++] = String.format("%s: %s",aggregator.getClass().getSimpleName(),aggregator.parseStr(read)); + } + return data; + } + + public ReadFeatureWindow reduce(String[] map, ReadFeatureWindow prevReduce) { + if ( map == null ) { return null; } + StringBuffer fStrBuilder = new StringBuffer(); + for ( String f : map ) { + fStrBuilder.append("\t"); + fStrBuilder.append(f); + } + + out.printf("%s\t%s\t%s%s%n",loc,name,sample,fStrBuilder); + + return null; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFExtractorWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFExtractorWalker.java new file mode 100755 index 000000000..dfc5acc3b --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFExtractorWalker.java @@ -0,0 +1,243 @@ +package 
org.broadinstitute.sting.oneoffprojects.walkers.newassociation; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ClippedBases; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.InsertSize; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/12/11 + * Time: 1:48 PM + * To change this template use File | Settings | File Templates. 
+ */ +@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class,NotPrimaryAlignmentReadFilter.class,UnmappedReadFilter.class}) +@By(DataSource.REFERENCE) +public class RFExtractorWalker extends ReadWalker { + + @ArgumentCollection + public RFAArgumentCollection rfaArgs = new RFAArgumentCollection(); + + @Argument(doc="Set the marker threshold to this.",shortName="mmt",fullName="markerModeThreshold",required=false) + public double markerModeThreshold = 0.05; + + @Argument(doc="Turn on marker mode (1 if sample is significantly greater than the threshold, 0 otherwise)",shortName="mm",fullName="markerMode",required=false) + public boolean markerMode = false; + + @Argument(doc="Turn on raw count mode: output will be raw aberrant read counts",shortName="c",fullName="count",required=false) + public boolean countMode = false; + + + @Output + public PrintStream out; + + protected Iterator locusIterator; + protected GenomeLoc iteratorLoc; + protected GenomeLoc loc; + protected String sample; + + public void initialize() { + if ( rfaArgs.windowSize % rfaArgs.windowJump != 0 ) { + throw new UserException("Window size is not divisible by window jump."); + } + + if ( markerMode && markerModeThreshold < 0.0 ) { + throw new UserException("Cannot have a negative threshold when using marker mode"); + } + + if ( countMode && markerMode ) { + throw new UserException("Cannot be both in count mode and marker mode"); + } + + locusIterator = getToolkit().getIntervals().iterator(); + iteratorLoc = locusIterator.next(); + } + + public RFWindow reduceInit() { + Map allCase = new HashMap(getToolkit().getSamples().size()); + for ( Sample s : getToolkit().getSAMFileSamples() ) { + allCase.put(s.getId(),true); + if ( s.getId() == null || s.getId().equals("null") ) { + throw new StingException("Sample IDs must not be null... 
" + s.toString() + " " + Boolean.toString(s.hasSAMFileEntry())); + } + } + + Set> aggregatorSet = getFeatureAggregators(rfaArgs.inputFeatures); + Set rfHolder1 = new HashSet(aggregatorSet.size()); + try { + for ( Class featureClass : aggregatorSet ) { + ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(rfaArgs); + rfHolder1.add(readFeature); + } + } catch ( Exception e ) { + throw new StingException("A read feature instantiation error occurred during initialization",e); + } + + ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()]; + int idx = 0; + for ( ReadFeatureAggregator f : rfHolder1 ) { + rfHolder2[idx++] = f; + } + Arrays.sort(rfHolder2, new Comparator() { + @Override + public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) { + return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName()); + } + }); + + List aggregators = Arrays.asList(rfHolder2); + + out.printf("HEADERchrm:start-stop"); + for ( String s : allCase.keySet() ) { + for ( ReadFeatureAggregator rfa : aggregators ) { + out.printf("\t%s.%s",s,rfa.getClass().getSimpleName()); + } + } + out.printf("%n"); + + return new RFWindow(aggregators,rfaArgs,allCase,getToolkit().getGenomeLocParser()); + } + + public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + if ( ref == null ) { return null; } // unmapped reads have null ref contexts + //loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart()); + GenomeLoc newLoc = ref.getLocus().getStartLocation(); // can be problematic if read aligns prior to start of contig -- should never happen + if ( newLoc.isPast(iteratorLoc.getStartLocation()) ) { + loc = newLoc; + } else { + loc = iteratorLoc.getStartLocation(); + } + if ( read == null ) { return null; } + sample = read.getReadGroup().getSample(); + return read; + } + + public RFWindow reduce(SAMRecord read, 
RFWindow prevReduce) { + if ( iteratorLoc != null && iteratorLoc.isBefore(loc) ) {// test if read is past end of the user interval + //logger.info(String.format("iteratorLoc: %s loc: %s",iteratorLoc.toString(),loc.toString())); + onIntervalDone(prevReduce); + iteratorLoc = locusIterator.hasNext() ? locusIterator.next() : null; + if ( loc.startsBefore(iteratorLoc) ) { + loc = iteratorLoc.getStartLocation(); + } + reduce(read,prevReduce); + } else if ( read != null ) { + // todo -- what happens if first read of an interval is not before or at the start of the interval? + + List>>> completed = prevReduce.inc(read, loc, sample,iteratorLoc); + if ( completed.size() > 0 ) { + // System.out.printf("At %s we have seen %d completed windows%n",loc,completed.size()) + // bed format + for ( Pair>> samWindow : completed ) { + GenomeLoc window = samWindow.first; + /*if ( prevPrint == null ) { + prevPrint = window; + } else if ( window.startsBefore(prevPrint) ) { + throw new StingException(String.format("Attempting to print at %s after having printed a record at %s",window.toString(),prevPrint.toString())); + } else { + prevPrint = window; + }*/ + out.printf("%s",window.toString()); + for ( Map.Entry> samEntry : samWindow.second.entrySet() ) { + for ( ReadFeatureAggregator aggregator : samEntry.getValue() ) { + if ( ! markerMode && ! countMode ) { + out.printf("\t%.5e,%d",aggregator.getMean(),aggregator.getnReads()); + } else if ( markerMode ) { + out.printf("\t%d",hasEvent(aggregator,markerModeThreshold,rfaArgs.fixedZ) ? 
1 : 0); + } else if ( countMode ) { + out.printf("\t%d", MathUtils.fastRound(aggregator.getMean()*aggregator.getnReads())); + } + } + } + out.printf("%n"); + } + } + } else { + prevReduce.inc(null,loc,null,iteratorLoc); + } + return prevReduce; + } + + public RFWindow onIntervalDone(RFWindow rWindow) { + //logger.info("In onIntervalDone at genome loc "+iteratorLoc.toString()+" with read loc "+loc.toString()); + List>>> completed = rWindow.flush(iteratorLoc); + for ( Pair>> samWindow : completed ) { + GenomeLoc window = samWindow.first; + /*if ( prevPrint == null ) { + prevPrint = window; + } else if ( window.startsBefore(prevPrint) ) { + throw new StingException(String.format("Attempting to print at %s after having printed a record at %s",window.toString(),prevPrint.toString())); + } else { + prevPrint = window; + }*/ + out.printf("%s",window.toString()); + for ( Map.Entry> samEntry : samWindow.second.entrySet() ) { + for ( ReadFeatureAggregator aggregator : samEntry.getValue() ) { + if ( ! markerMode && ! countMode ) { + out.printf("\t%.5e,%d",aggregator.getMean(),aggregator.getnReads()); + } else if ( markerMode ) { + out.printf("\t%d",hasEvent(aggregator,markerModeThreshold,rfaArgs.fixedZ) ? 
1 : 0); + } else if ( countMode ) { + out.printf("\t%d", MathUtils.fastRound(aggregator.getMean()*aggregator.getnReads())); + } + } + } + out.printf("%n"); + } + + return rWindow; + } + + public Set> getFeatureAggregators(List requestedFeatures) { + HashSet> newFeatureSet = new HashSet>(); + List> availableFeatures = new PluginManager(ReadFeatureAggregator.class).getPlugins(); + + if ( rfaArgs.inputFeatures == null ) { + newFeatureSet.addAll(availableFeatures); + return newFeatureSet; + } + + + Map> classNameToClass = new HashMap>(rfaArgs.inputFeatures.size()); + for ( Class clazz : availableFeatures ) { + classNameToClass.put(clazz.getSimpleName(),clazz); + } + + for ( String s : requestedFeatures) { + if ( classNameToClass.containsKey(s) ) { + newFeatureSet.add(classNameToClass.get(s)); + } else { + throw new UserException("The name "+s+" does not correspond to an available read feature class."); + } + } + + return newFeatureSet; + } + + public static boolean hasEvent(ReadFeatureAggregator aggregator, double lowThresh, double sigLevel) { + return (aggregator.getMean() - lowThresh)*Math.sqrt(aggregator.getnReads())/aggregator.getVar() > sigLevel; + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java new file mode 100755 index 000000000..ae3f40c77 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java @@ -0,0 +1,243 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; + +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import 
org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + */ +public class RFWindow { + + private RFAArgumentCollection argumentCollection; // the RF Argument Collection from the RF Walker + private List windowStarts; // holds the starting positions of the windows + private Map sampleCohortMap; // identifies case samples ("true") and control samples ("false") + List aggregators; // read feature aggregators to be used (maintained for fast cloning, and ordering) + // feature ---> sample ---> count + List>> aggregatorWindows; // holds the map between samples and features in the current windows + private GenomeLocParser parser; // the active parser, for creating GenomeLocs for empty windows + private GenomeLoc previousLoc; // the previous genome loc within the user interval given to the RFWindow + + /** + * Defines a new window which maps samples to read feature aggregators + * @return - a new element for aggregatorWindows + */ + private Map> newWindow() { + Map> win = new HashMap>(sampleCohortMap.size()); + for ( String s : sampleCohortMap.keySet() ) { + win.put(s,getAggregators()); + } + // todo -- generalize me + win.put("case",getAggregators()); + win.put("control",getAggregators()); + + return win; + } + + /** + * Generates a list of new aggregators to collect data + * @return list of ReadFeatureAggregators to be the value of a new window + */ + private List getAggregators() { + ArrayList newEmptyAgs = new ArrayList(aggregators.size()); + try { + for ( ReadFeatureAggregator ag : aggregators ) { + newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(argumentCollection)); + } + } catch (Exception e) { + throw new StingException("Error instantiating read feature aggregator",e); + } + + return newEmptyAgs; + } + + /** + * A constructor for the RFWindow object + * @param collection - the argument 
collection, from the walker + * @param cohortMap - the map between sample IDs and whether or not it is a case sample, from the walker + * @param parser - the Genome Loc Parser, from the walker + */ + public RFWindow(List aggregators, RFAArgumentCollection collection, Map cohortMap, GenomeLocParser parser) { + this.argumentCollection = collection; + this.sampleCohortMap = cohortMap; + this.parser = parser; + this.aggregators = aggregators; + } + + /** + * Instantiate the tiled windows of sample -> List maps, filling in empty windows up to the one which contains the + * provided genomeloc if necessary. + * @param loc - the location to fill up to, usually the starting position of a read or an interval + * @param currentUserInterval - the current user-provided interval being processed by the walker + */ + @Requires({"! currentUserInterval.isBefore(loc)"}) + public void instantiate(GenomeLoc loc, GenomeLoc currentUserInterval) { + aggregatorWindows = new ArrayList>>(argumentCollection.windowSize/argumentCollection.windowJump); + windowStarts = new ArrayList(argumentCollection.windowSize/argumentCollection.windowJump); + //System.out.printf("Calling fill at %s\t%s%n",loc,currentUserInterval); + this.fill(loc, currentUserInterval); + } + + /** + * Fills the tiled window lists with empty windows up to the one which includes the locus + * todo -- this can take a lot of memory for large intervals instantiated far into them, e.g. + * |---------------------------------------------------------------------| interval + * . <=== locus + * todo -- perhaps a reduced representation for empty windows that were not created but already have expired? + * @param loc - the location to fill up to, usually the starting position of a read or an interval + * @param currentUserInterval - the current user-provided interval being processed by the walker + */ + @Requires({"! 
currentUserInterval.isBefore(loc)"}) + public void fill(GenomeLoc loc, GenomeLoc currentUserInterval) { + // case -1: window could be empty + if ( windowStarts == null ) { + instantiate(loc,currentUserInterval); + } + + // case 0: if the windows are empty add in the beginning of the interval + if ( windowStarts.size() == 0 ) { + windowStarts.add(currentUserInterval.getStartLocation()); + aggregatorWindows.add(newWindow()); + } + + // case 1: loc is before or within windowJump bases of the current user interval; we need only instantiate the first window + if ( loc.isBefore(currentUserInterval) || loc.distance(currentUserInterval) <= argumentCollection.windowJump ) { + // do nothing at all + } else { + // case 2: loc is somewhere in the middle or at the end of current user interval, need to fill in windows up until then + GenomeLoc nextLoc = windowStarts.get(windowStarts.size()-1); + while ( loc.distance(nextLoc) > argumentCollection.windowJump ) { + //System.out.printf("Filling with nextloc %s%n",nextLoc); + nextLoc = shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump); + windowStarts.add(nextLoc); + aggregatorWindows.add(newWindow()); + } + } + + } + + /** + * Shifts a location from chrZ:X to chrZ:X+jump + * @param loc - location to shift + * @param jump - amount to shift by + * @return loc with start position shifted by jump + */ + @Requires("loc.getStart()+jump < parser.getContigInfo(loc.getContig()).getSequenceLength()") // e.g. 
that the new loc is not off the end of the contig + private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) { + return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump); + } + + /** + * Fills in missing windows between the previously-seen one and the ending interval, then expires all windows, returning + * them as complete, resetting previousLocation to null so that the next read re-instantiates internal window data + * @param userIntervalEnding - the user interval which is ending + * @return those currently active windows, plus empty interpolating windows between the last active window and the end of the interval + */ + public List>>> flush(GenomeLoc userIntervalEnding) { + // jump in locations -- flush the windows + List>>> complete = new ArrayList>>>(aggregators.size()); + // fill in any uncovered windows + GenomeLoc iStop = userIntervalEnding.getStopLocation(); + //System.out.printf("Calling fill from within flush at %s%n",userIntervalEnding); + fill(iStop,userIntervalEnding); + // now expire all windows, terminating them either at the proper window size, or at the endpoint of the interval if it comes sooner + while ( windowStarts.size() > 0 ) { + Map> cMap = aggregatorWindows.remove(0); + GenomeLoc wsLoc = windowStarts.remove(0); + + GenomeLoc cLoc; + if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) { + cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize); + } else { + cLoc = wsLoc.endpointSpan(iStop); + } + + complete.add(new Pair>>(cLoc,cMap)); + } + + previousLoc = null; // will re-instantiate data upon next loc + + return complete; + } + + /** + * Workhorse method: + * - determines if new windows need be made + * - determines if old windows need be tested & trashed + * - determines which windows need to be updated & updates them + * + * @param loc - starting alignment position of the read + * @param sample - the sample ID from whom the read was sequenced + * @return - those 
feature window(s) that need be tested (and then baleeted) + */ + public List>>> inc(SAMRecord record, GenomeLoc loc, String sample, GenomeLoc userInterval) { + List>>> complete = new ArrayList>>>(aggregators.size()); + if ( previousLoc == null ) { + // first time, gotta instantiate stuff + instantiate(loc,userInterval); + windowInc(sample,record,aggregatorWindows.get(0)); + } else if ( loc.distance(previousLoc) == 0 ) { + // best and most common case: just update the living windows + for ( Map> win : aggregatorWindows ) { + windowInc(sample,record,win); + } + } else { + // another standard case: we've gone to some further base in the same interval + while ( loc.distance(windowStarts.get(windowStarts.size()-1)) > argumentCollection.windowJump ) { + // careful, don't use the location itself, but add in windows every winJump bases until this condition is not met + //System.out.printf("Adding within inc at %s\t%s%n",loc,userInterval); + windowStarts.add(shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump)); + aggregatorWindows.add(newWindow()); + } + + while ( windowStarts.size() > 0 && loc.distance(windowStarts.get(0)) > argumentCollection.windowSize ) { + Map> cMap = aggregatorWindows.remove(0); + GenomeLoc wsLoc = windowStarts.remove(0); + GenomeLoc iStop = userInterval.getStopLocation(); + GenomeLoc cLoc; + if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) { + cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize); + } else { + cLoc = wsLoc.endpointSpan(iStop); + } + complete.add(new Pair>>(cLoc,cMap)); + } + + for ( Map> win : aggregatorWindows ) { + windowInc(sample,record,win); + } + + } + + previousLoc = loc; + + return complete; + } + + /** + * Incorporate new features into the window + * @param sample - id of sample from which the features come + * @param record - the read + * @param window - the particular window to be updated + */ + private void 
windowInc(String sample, SAMRecord record, Map> window) { + if ( sample == null || record == null ) { return; } + + for (ReadFeatureAggregator aggregator : window.get(sample) ) { + aggregator.aggregate(record); + } + + for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? "case" : "control") ) { + aggregator.aggregate(record); + } + } + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java~ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java~ new file mode 100755 index 000000000..cd65ad032 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/RFWindow.java~ @@ -0,0 +1,227 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; + +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: chartl + */ +public class RFWindow { + + private RFAArgumentCollection argumentCollection; // the RF Argument Collection from the RF Walker + private List windowStarts; // holds the starting positions of the windows + private Map sampleCohortMap; // identifies case samples ("true") and control samples ("false") + List aggregators; // read feature aggregators to be used (maintained for fast cloning, and ordering) + // feature ---> sample ---> count + List>> aggregatorWindows; // holds the map between samples and features in the current windows + private GenomeLocParser parser; // the active parser, for creating GenomeLocs for empty windows + private GenomeLoc previousLoc; // the previous genome loc within the user interval given to the RFWindow + + /** + * Defines a new window which maps samples to read feature aggregators + * @return - a new element for aggregatorWindows + */ + private Map> newWindow() { + Map> win = new HashMap>(sampleCohortMap.size()); + for ( String s : sampleCohortMap.keySet() ) { + win.put(s,getAggregators()); + } + // todo -- generalize me + win.put("case",getAggregators()); + win.put("control",getAggregators()); + + return win; + } + + /** + * Generates a list of new aggregators to collect data + * @return list of ReadFeatureAggregators to be the value of a new window + */ + private List getAggregators() { + ArrayList newEmptyAgs = new ArrayList(aggregators.size()); + try { + for ( ReadFeatureAggregator ag : aggregators ) { + newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(argumentCollection)); + } + } catch (Exception e) { + throw new StingException("Error instantiating read feature aggregator",e); + } + + return newEmptyAgs; + } + + /** + * A constructor for the RFWindow object + * @param collection - the argument collection, from the walker + * @param cohortMap - the map between sample IDs and whether or not it is a case sample, from the walker + * @param parser - the Genome Loc Parser, from 
the walker + */ + public RFWindow(List aggregators, RFAArgumentCollection collection, Map cohortMap, GenomeLocParser parser) { + this.argumentCollection = collection; + this.sampleCohortMap = cohortMap; + this.parser = parser; + this.aggregators = aggregators; + } + + /** + * Instantiate the tiled windows of sample -> List maps, filling in empty windows up to the one which contains the + * provided genomeloc if necessary. + * @param loc - the location to fill up to, usually the starting position of a read or an interval + * @param currentUserInterval - the current user-provided interval being processed by the walker + */ + @Requires({"! currentUserInterval.isBefore(loc)"}) + public void instantiate(GenomeLoc loc, GenomeLoc currentUserInterval) { + aggregatorWindows = new ArrayList>>(argumentCollection.windowSize/argumentCollection.windowJump); + windowStarts = new ArrayList(argumentCollection.windowSize/argumentCollection.windowJump); + this.fill(loc, currentUserInterval); + } + + /** + * Fills the tiled window lists with empty windows up to the one which includes the locus + * todo -- this can take a lot of memory for large intervals instantiated far into them, e.g. + * |---------------------------------------------------------------------| interval + * . <=== locus + * todo -- perhaps a reduced representation for empty windows that were not created but already have expired? + * @param loc - the location to fill up to, usually the starting position of a read or an interval + * @param currentUserInterval - the current user-provided interval being processed by the walker + */ + @Requires({"! 
currentUserInterval.isBefore(loc)"}) + public void fill(GenomeLoc loc, GenomeLoc currentUserInterval) { + // case 0: if the windows are empty add in the beginning of the interval + if ( windowStarts.size() == 0 ) { + windowStarts.add(currentUserInterval.getStartLocation()); + aggregatorWindows.add(newWindow()); + } + + // case 1: loc is before or within windowJump bases of the current user interval; we need only instantiate the first window + if ( loc.isBefore(currentUserInterval) || loc.distance(currentUserInterval) <= argumentCollection.windowJump ) { + // do nothing at all + } else { + // case 2: loc is somewhere in the middle or at the end of current user interval, need to fill in windows up until then + GenomeLoc nextLoc = shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump); + while ( loc.distance(nextLoc) > argumentCollection.windowJump ) { + nextLoc = shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump); + windowStarts.add(nextLoc); + aggregatorWindows.add(newWindow()); + } + } + + } + + /** + * Shifts a location from chrZ:X to chrZ:X+jump + * @param loc - location to shift + * @param jump - amount to shift by + * @return loc with start position shifted by jump + */ + @Requires("loc.getStart()+jump < parser.getContigInfo(loc.getContig()).getSequenceLength()") // e.g. 
that the new loc is not off the end of the contig + private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) { + return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump); + } + + /** + * Fills in missing windows between the previously-seen one and the ending interval, then expires all windows, returning + * them as complete, resetting previousLocation to null so that the next read re-instantiates internal window data + * @param userIntervalEnding - the user interval which is ending + * @return those currently active windows, plus empty interpolating windows between the last active window and the end of the interval + */ + public List>>> flush(GenomeLoc userIntervalEnding) { + // jump in locations -- flush the windows + List>>> complete = new ArrayList>>>(aggregators.size()); + // fill in any uncovered windows + GenomeLoc iStop = userIntervalEnding.getStopLocation(); + fill(iStop,userIntervalEnding); + // now expire all windows, terminating them either at the proper window size, or at the endpoint of the interval if it comes sooner + while ( windowStarts.size() > 0 ) { + Map> cMap = aggregatorWindows.remove(0); + GenomeLoc wsLoc = windowStarts.remove(0); + + GenomeLoc cLoc; + if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) { + cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize); + } else { + cLoc = wsLoc.endpointSpan(iStop); + } + + complete.add(new Pair>>(cLoc,cMap)); + } + + previousLoc = null; // will re-instantiate data upon next loc + + return complete; + } + + /** + * Workhorse method: + * - determines if new windows need be made + * - determines if old windows need be tested & trashed + * - determines which windows need to be updated & updates them + * + * @param loc - starting alignment position of the read + * @param sample - the sample ID from whom the read was sequenced + * @return - those feature window(s) that need be tested (and then baleeted) + */ + public List>>> 
inc(SAMRecord record, GenomeLoc loc, String sample, GenomeLoc userInterval) { + List>>> complete = new ArrayList>>>(aggregators.size()); + if ( previousLoc == null ) { + // first time, gotta instantiate stuff + instantiate(loc,userInterval); + windowInc(sample,record,aggregatorWindows.get(0)); + } else if ( loc.distance(previousLoc) == 0 ) { + // best and most common case: just update the living windows + for ( Map> win : aggregatorWindows ) { + windowInc(sample,record,win); + } + } else { + // another standard case: we've gone to some further base in the same interval + while ( loc.distance(windowStarts.get(windowStarts.size()-1)) > argumentCollection.windowJump ) { + // careful, don't use the location itself, but add in windows every winJump bases until this condition is not met + windowStarts.add(shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump)); + aggregatorWindows.add(newWindow()); + } + + while ( windowStarts.size() > 0 && loc.distance(windowStarts.get(0)) > argumentCollection.windowSize ) { + Map> cMap = aggregatorWindows.remove(0); + GenomeLoc cLoc = windowStarts.remove(0).endpointSpan(previousLoc); + complete.add(new Pair>>(cLoc,cMap)); + } + + for ( Map> win : aggregatorWindows ) { + windowInc(sample,record,win); + } + + } + + previousLoc = loc; + + return complete; + } + + /** + * Incorporate new features into the window + * @param sample - id of sample from which the features come + * @param record - the read + * @param window - the particular window to be updated + */ + private void windowInc(String sample, SAMRecord record, Map> window) { + if ( sample == null || record == null ) { return; } + + for (ReadFeatureAggregator aggregator : window.get(sample) ) { + aggregator.aggregate(record); + } + + for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? 
"case" : "control") ) { + aggregator.aggregate(record); + } + } + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/ReadFeatureWindow.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/ReadFeatureWindow.java new file mode 100755 index 000000000..b357dec9b --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/ReadFeatureWindow.java @@ -0,0 +1,152 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.*; + +/** + * Workhorse class of read feature association: maintains the active windows, culls the inactive ones and passes + * them back for testing. Windows consist of aggregators, and list indeces link windows to their starting loc which + * determines their active/inactive status. 
+ */ +public class ReadFeatureWindow { + RFAArgumentCollection args; + private List winStarts; + private GenomeLoc previousLoc; + private int winSize; + private int winJump; + private GenomeLocParser parser; + + private Map sampleCohortMap; + List aggregators; + // feature ---> sample ---> count + List>> aggregatorWindows; + + public ReadFeatureWindow(List aggregators, Map cohortMap, RFAArgumentCollection args, GenomeLocParser parser) { + winSize = args.windowSize; + winJump = args.windowJump; + sampleCohortMap = cohortMap; + this.args = args; + this.aggregators =aggregators; + this.parser = parser; + } + + public void instantiate(GenomeLoc loc) { + aggregatorWindows = new ArrayList>>(winSize/winJump); + winStarts = new ArrayList(winSize/winJump); + winStarts.add(loc); + aggregatorWindows.add(newWindow()); + } + + /** + * Workhorse method: + * - determines if new windows need be made + * - determines if old windows need be tested & trashed + * - determines which windows need to be updated & updates them + * + * @param loc - starting alignment position of the read + * @param sample - the sample ID from whom the read was sequenced + * @return - those feature window(s) that need be tested (and then baleeted) + */ + public List>>> inc(SAMRecord record, GenomeLoc loc, String sample) { + List>>> complete = new ArrayList>>>(aggregators.size()); + if ( previousLoc == null ) { + // first time, gotta instantiate stuff + instantiate(loc); + windowInc(sample,record,aggregatorWindows.get(0)); + } else if ( loc.distance(previousLoc) == 0 ) { + // best and most common case: just update the living windows + for ( Map> win : aggregatorWindows ) { + windowInc(sample,record,win); + } + } else { + // another standard case: we've gone to some further base in the same interval + while ( loc.distance(winStarts.get(winStarts.size()-1)) > winJump ) { + // careful, don't use the location itself, but add in windows every winJump bases until this condition is not met + 
winStarts.add(shiftLocByJump(winStarts.get(winStarts.size()-1),winJump)); + aggregatorWindows.add(newWindow()); + } + + while ( winStarts.size() > 0 && loc.distance(winStarts.get(0)) > winSize ) { + Map> cMap = aggregatorWindows.remove(0); + GenomeLoc cLoc = winStarts.remove(0).endpointSpan(previousLoc); + complete.add(new Pair>>(cLoc,cMap)); + } + + for ( Map> win : aggregatorWindows ) { + windowInc(sample,record,win); + } + + } + + previousLoc = loc; + + return complete; + } + + public List>>> flush() { + // jump in locations -- flush the windows + List>>> complete = new ArrayList>>>(aggregators.size()); + while ( winStarts.size() > 0 ) { + Map> cMap = aggregatorWindows.remove(0); + GenomeLoc cLoc = winStarts.remove(0).endpointSpan(previousLoc); + complete.add(new Pair>>(cLoc,cMap)); + } + + previousLoc = null; // will re-instantiate data upon next loc + + return complete; + } + + private Map> newWindow() { + Map> win = new HashMap>(sampleCohortMap.size()); + for ( String s : sampleCohortMap.keySet() ) { + win.put(s,getAggregators()); + } + // todo -- generalize me + win.put("case",getAggregators()); + win.put("control",getAggregators()); + + return win; + } + + private List getAggregators() { + ArrayList newEmptyAgs = new ArrayList(aggregators.size()); + try { + for ( ReadFeatureAggregator ag : aggregators ) { + newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(args)); + } + } catch (Exception e) { + throw new StingException("Error instantiating read feature aggregator",e); + } + + return newEmptyAgs; + } + + /** + * Incorporate new features into the window + * @param sample - id of sample from which the features come + * @param record - the read + * @param window - the particular window to be updated + */ + private void windowInc(String sample, SAMRecord record, Map> window) { + if ( sample == null || record == null ) { return; } + + for (ReadFeatureAggregator aggregator : window.get(sample) ) { + 
aggregator.aggregate(record); + } + + for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? "case" : "control") ) { + aggregator.aggregate(record); + } + } + + private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) { + return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/AberrantInsertSize.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/AberrantInsertSize.java new file mode 100755 index 000000000..8172ab5bb --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/AberrantInsertSize.java @@ -0,0 +1,31 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 1:09 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class AberrantInsertSize extends BinaryFeatureAggregator { + + private int min; + private int max; + + public AberrantInsertSize(RFAArgumentCollection col) { + super(col); + min = col.lowInsertSize; + max = col.highInsertSize; + } + + public Boolean extractFeature(SAMRecord rec) { + return Math.abs(rec.getInferredInsertSize()) > max || Math.abs(rec.getInferredInsertSize()) < min; + } + + public boolean featureDefined(SAMRecord rec) { + return rec.getReadPairedFlag() && rec.getProperPairFlag(); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryClippedBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryClippedBases.java new file mode 100755 index 000000000..7a52013f3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryClippedBases.java @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +/** + * Created by IntelliJ IDEA. + * User: Ghost + * Date: 5/19/11 + * Time: 12:08 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class BinaryClippedBases extends BinaryFeatureAggregator { + + private short baseLim; + private final byte baseQualLim = 20; + + public Boolean extractFeature(SAMRecord read) { + int firstClippedToAliStart = read.getUnclippedStart()-read.getAlignmentStart(); + int lastUnclippedToReadEnd = read.getUnclippedEnd()-read.getAlignmentEnd(); + + byte[] quals = read.getBaseQualities(); + int nClipped = 0; + for ( int offset = 0; offset < firstClippedToAliStart; offset++ ) { + if ( quals[offset] >= baseQualLim ) { + nClipped++; + } + } + + for ( int offset = quals.length - lastUnclippedToReadEnd; offset < quals.length ; offset++ ) { + if ( quals[offset] >= baseQualLim ) { + nClipped ++; + } + } + + return nClipped >= baseLim; + } + + public boolean featureDefined(SAMRecord rec) { return ! rec.getReadPairedFlag() || Math.abs(rec.getInferredInsertSize()) > 100; } // unpaired or no adaptor sequence + + public BinaryClippedBases(RFAArgumentCollection col) { + super(col); + baseLim = col.clippedBases; + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java new file mode 100755 index 000000000..3c515db1d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java @@ -0,0 +1,24 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 12:52 PM + * To change this template use File | Settings | File Templates. 
+ */ +public abstract class BinaryFeatureAggregator extends ReadFeatureAggregator { + + public BinaryFeatureAggregator(RFAArgumentCollection col) { + super(col); + } + + public void aggregate(Boolean hasFeature) { + // now robustified + mean = ( (hasFeature ? 1 : 0)+nReads*mean)/(++nReads); + var = mean*(1-mean) + Math.pow(2,1-nReads); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java~ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java~ new file mode 100755 index 000000000..86103f01a --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/BinaryFeatureAggregator.java~ @@ -0,0 +1,24 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 12:52 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class BinaryFeatureAggregator extends ReadFeatureAggregator { + + public BinaryFeatureAggregator(RFAArgumentCollection col) { + super(col); + } + + public void aggregate(Boolean hasFeature) { + // now robustified + mean = ( 1+(hasFeature ? 
1 : 0)+nReads*mean)/(++nReads+1); + var = mean*(1-mean); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ClippedBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ClippedBases.java new file mode 100755 index 000000000..dc3f23793 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ClippedBases.java @@ -0,0 +1,35 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 1:33 PM + * To change this template use File | Settings | File Templates. + */ +public class ClippedBases { + // todo -- make a binary feature version of this + + public Integer extractFeature(SAMRecord record) { + int nClipped = 0; + + for ( CigarElement e : record.getCigar().getCigarElements() ) { + if ( e.getOperator().equals(CigarOperator.SOFT_CLIP) || e.getOperator().equals(CigarOperator.HARD_CLIP) ) { + nClipped += e.getLength(); + } + } + + return nClipped; + } + + public boolean featureDefined(SAMRecord rec) { return true; } + + public ClippedBases(RFAArgumentCollection col) { + //super(col); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/InsertSize.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/InsertSize.java new file mode 100755 index 000000000..c8d8fcce6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/InsertSize.java @@ -0,0 +1,27 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import 
org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 12:58 PM + * To change this template use File | Settings | File Templates. + */ +public class InsertSize { + // todo -- this is deprecated by AIS, so the extension is removed. + + public InsertSize(RFAArgumentCollection col) { + //super(col); + } + + protected Integer extractFeature(SAMRecord record) { + return Math.abs(record.getInferredInsertSize()); + } + + protected boolean featureDefined(SAMRecord record) { + return record.getReadPairedFlag() && record.getProperPairFlag(); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateOtherContig.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateOtherContig.java new file mode 100755 index 000000000..7e6c2e1c4 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateOtherContig.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 1:06 PM + * To change this template use File | Settings | File Templates. + */ +public class MateOtherContig extends BinaryFeatureAggregator { + + public MateOtherContig(RFAArgumentCollection col) { + super(col); + } + + public Boolean extractFeature(SAMRecord record) { + return ! record.getReferenceName().equals(record.getMateReferenceName()); + } + + public boolean featureDefined(SAMRecord read) { + return read.getReadPairedFlag() && ! 
read.getMateUnmappedFlag(); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateSameStrand.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateSameStrand.java new file mode 100755 index 000000000..9b9a6a1fd --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateSameStrand.java @@ -0,0 +1,27 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 1:15 PM + * To change this template use File | Settings | File Templates. + */ +public class MateSameStrand extends BinaryFeatureAggregator { + + public Boolean extractFeature(SAMRecord record) { + return record.getReadNegativeStrandFlag() == record.getMateNegativeStrandFlag(); + } + + public boolean featureDefined(SAMRecord record) { + return record.getReadPairedFlag() && ! record.getMateUnmappedFlag() && record.getReferenceIndex().equals(record.getMateReferenceIndex()); + } + + + public MateSameStrand(RFAArgumentCollection col) { + super(col); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateUnmapped.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateUnmapped.java new file mode 100755 index 000000000..bd9fcb1f3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/MateUnmapped.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. 
+ * User: chartl + * Date: 5/4/11 + * Time: 1:32 PM + * To change this template use File | Settings | File Templates. + */ +public class MateUnmapped extends BinaryFeatureAggregator { + + public Boolean extractFeature(SAMRecord record) { + return record.getMateUnmappedFlag(); + } + + public boolean featureDefined(SAMRecord record) { + return record.getReadPairedFlag(); + } + + public MateUnmapped(RFAArgumentCollection col) { + super(col); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/NumericFeatureAggregator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/NumericFeatureAggregator.java new file mode 100755 index 000000000..dfd5fc089 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/NumericFeatureAggregator.java @@ -0,0 +1,50 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 12:39 PM + * To change this template use File | Settings | File Templates. 
+ */ +public abstract class NumericFeatureAggregator extends ReadFeatureAggregator { + + private int min; + private int max; + + public NumericFeatureAggregator(RFAArgumentCollection col) { + super(col); + min = -1; + max = -1; + } + + protected void aggregate(Integer datum) { + if ( min == -1 ) { + min = datum; + } else if ( max == -1 ) { + if ( datum > min ) { + max = datum; + } else { + max = min; + min = datum; + } + } else if ( datum > max ) { + update(max); + max = datum; + } else if ( datum < min ) { + update(min); + min = datum; + } else { + update(datum); + } + } + + protected void update(Integer datum) { + double oldMean = mean; + mean += (datum - mean)/(1+nReads); + var = ((nReads*var) + (datum - oldMean)*(datum-mean))/++nReads; + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ProperPair.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ProperPair.java new file mode 100755 index 000000000..ef1a1d436 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ProperPair.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 6/8/11 + * Time: 11:59 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ProperPair extends BinaryFeatureAggregator { + + public ProperPair(RFAArgumentCollection collection) { + super(collection); + } + + public Boolean extractFeature(SAMRecord record) { + return record.getProperPairFlag(); + } + + public boolean featureDefined(SAMRecord record) { + return record.getReadPairedFlag(); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ReadFeatureAggregator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ReadFeatureAggregator.java new file mode 100755 index 000000000..4bf167657 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/newassociation/features/ReadFeatureAggregator.java @@ -0,0 +1,61 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 5/4/11 + * Time: 12:33 PM + * To change this template use File | Settings | File Templates. 
+ */ +public abstract class ReadFeatureAggregator { + + protected double mean; + protected double var; + protected int nReads; + + public ReadFeatureAggregator(RFAArgumentCollection collection) { + init(collection); + mean = 0.0; + var = 0.0; + nReads = 0; + } + + public void aggregate(SAMRecord record) { + if ( featureDefined(record) ) { + aggregate(extractFeature(record)); + } + } + + protected abstract void aggregate(X feature); + + protected abstract boolean featureDefined(SAMRecord record); + + protected abstract X extractFeature(SAMRecord record); + + public double getMean() { return mean; } + public double getVar() { return var; } + public double getUnbiasedVar() { return var*( (double) nReads)/(nReads-1); } + public int getnReads() { return nReads; } + + public void init(RFAArgumentCollection collection) { } + + public X parse(SAMRecord read) { + if ( featureDefined(read) ) { + return extractFeature(read); + } else { + return null; + } + } + + public String parseStr(SAMRecord read) { + if ( featureDefined(read) ) { + return extractFeature(read).toString(); + } else { + return "undefined"; + } + } + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java new file mode 100755 index 000000000..140b82350 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java @@ -0,0 +1,169 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.validation; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import 
org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.PrintStream; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 6/13/11 + * Time: 2:12 PM + * To change this template use File | Settings | File Templates. + */ +@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class), +@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)}) +public class PickSequenomProbes2 extends RodWalker { + + @Output + PrintStream out; + + GenomeLoc prevInterval; + GenomeLoc allelePos; + String probeName; + StringBuilder sequence; + boolean sequenceInvalid; + + public Integer reduceInit() { + prevInterval = null; + sequence = null; + sequenceInvalid = false; + probeName = null; + return 0; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; } + + GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation(); + if ( prevInterval == null || ! 
interval.equals(prevInterval) ) { + // we're in a new interval, we should: + // 1) print out previous data + // 2) reset internal data + // 3) instantiate traversal of this interval + + // step 1: + if ( prevInterval != null ) { + // there was a previous interval + validateSequence(); // ensure the sequence in the region is valid + lowerRepeats(); // change repeats in sequence to lower case + print(); // print out the fasta sequence + } + + // step 2: + prevInterval = interval; + allelePos = null; + sequence = new StringBuilder(); + sequenceInvalid = false; + probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues().get(1); + } + + // step 3 (or 1 if not new): + // build up the sequence + + VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus()); + VariantContext validate = tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus()); + + if ( mask == null && validate == null ) { + sequence.append(Character.toUpperCase((char) ref.getBase())); + } else if ( validate != null ) { + // doesn't matter if there's a mask here too -- this is what we want to validate + sequence.append('['); + sequence.append(validate.getAlternateAllele(0).toString()); + sequence.append('/'); + sequence.append(validate.getReference().toString()); + sequence.append(']'); + allelePos = ref.getLocus(); + } else /* (mask != null && validate == null ) */ { + if ( ! mask.isSNP() ) { + logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. 
This probe will not be designed."); + logger.warn(mask.toString()); + sequenceInvalid = true; + sequence.append((char) ref.getBase()); + } else { + sequence.append((char) BaseUtils.N); + } + } + + return 1; + } + + public Integer reduce(Integer i, Integer j) { + return 0; + } + + public void validateSequence() { + // code for ensuring primer sequence is valid goes here + + // validate that there are no masked sites near to the variant site + String seq = sequence.toString(); + int start = seq.indexOf('[') - 4; + int end = seq.indexOf(']') + 5; + boolean maskNearVariantSite = false; + for ( int i = start; i < end; i++ ) { + maskNearVariantSite |= (seq.charAt(i) == 'N'); + } + + if ( maskNearVariantSite ) { + logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed."); + sequenceInvalid = true; + } + + if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { + logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap."); + sequenceInvalid = true; + } + } + + public void lowerRepeats() { + // convert to lower case low-complexity repeats, e.g. tandem k-mers + final int K_LIM = 6; + String seq = sequence.toString(); + StringBuilder newSequence = new StringBuilder(); + int start_pos = 0; + while( start_pos < seq.length() ) { + boolean broke = false; + for ( int length = K_LIM; length > 1; length -- ) { + if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) { + newSequence.append(seq.substring(start_pos,start_pos+2*length).toLowerCase()); + start_pos += length; + broke = true; + break; + } + } + + if ( ! 
broke ) { + newSequence.append(seq.substring(start_pos,start_pos+1)); + start_pos++; + } + + } + + sequence = newSequence; + } + + public boolean equalsIgnoreNs(String one, String two) { + if ( one.length() != two.length() ) { return false; } + for ( int idx = 0; idx < one.length(); idx++ ) { + if ( one.charAt(idx) != two.charAt(idx) ) { + if ( one.charAt(idx) != 'N' && two.charAt(idx) != 'N' ) { + return false; + } + } + } + + return true; + } + + public void print() { + out.printf("%s\t%s\t%s%n",allelePos.toString(),probeName,sequence.toString()); + } +} diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index a8b774986..1f8800542 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -72,6 +72,10 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome @Ensures("result != null") public final GenomeLoc getLocation() { return this; } + public final GenomeLoc getStartLocation() { return new GenomeLoc(getContig(),getContigIndex(),getStart(),getStart()); } + + public final GenomeLoc getStopLocation() { return new GenomeLoc(getContig(),getContigIndex(),getStop(),getStop()); } + /** * @return the name of the contig of this GenomeLoc */ @@ -142,6 +146,24 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome Math.max( getStop(), that.getStop()) ); } + /** + * Returns a new GenomeLoc that represents the region between the endpoints of this and that. Requires that + * this and that GenomeLoc are both mapped. + */ + @Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"}) + @Ensures("result != null") + public GenomeLoc endpointSpan(GenomeLoc that) throws ReviewedStingException { + if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { + throw new ReviewedStingException("Cannot get endpoint span for unmerged genome locs"); + } + + if ( ! 
// ==== java/src/org/broadinstitute/sting/utils/GenomeLoc.java -- added convenience methods ====
// (reconstructed whole; the endpointSpan text was split across the previous chunk boundary)

    // One-base GenomeLoc pinned at this loc's start coordinate.
    public final GenomeLoc getStartLocation() { return new GenomeLoc(getContig(),getContigIndex(),getStart(),getStart()); }

    // One-base GenomeLoc pinned at this loc's stop coordinate.
    public final GenomeLoc getStopLocation() { return new GenomeLoc(getContig(),getContigIndex(),getStop(),getStop()); }

    /**
     * Returns a new GenomeLoc that represents the region between the endpoints of this and that. Requires that
     * this and that GenomeLoc are both mapped.
     */
    @Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
    @Ensures("result != null")
    public GenomeLoc endpointSpan(GenomeLoc that) throws ReviewedStingException {
        if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
            // NOTE(review): message says "unmerged" but the check is for unmapped locs -- confirm wording.
            throw new ReviewedStingException("Cannot get endpoint span for unmerged genome locs");
        }

        if ( ! this.getContig().equals(that.getContig()) ) {
            throw new ReviewedStingException("Cannot get endpoint span for genome locs on different contigs");
        }

        return new GenomeLoc(getContig(),this.contigIndex,Math.min(getStart(),that.getStart()),Math.max(getStop(),that.getStop()));
    }

    // True iff this and that lie on the same contig and share the same stop coordinate.
    @Requires("that != null")
    public boolean endsAt(GenomeLoc that) {
        return (this.compareContigs(that) == 0) && ( this.getStop() == that.getStop() );
    }

// ==== scala/qscript/oneoffs/chartl/ExomeVQSR.q (new file) ====

import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.function.ListWriterFunction
import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
import org.broadinstitute.sting.queue.QScript
import collection.JavaConversions._
import org.broadinstitute.sting.utils.text.XReadLines

// Exome VQSR pipeline: call and filter indels + SNPs, train/apply the contrastive
// recalibrator, then VariantEval the standard and flanking regions.
class FullCallingPipeline extends QScript {
  qscript =>

  @Input(doc="path to GATK jar", shortName="G")
  var gatkJar: File = _

  // NOTE(review): declared @Input but holds an Int; @Argument is the conventional annotation here.
  @Input(doc="level of parallelism for UnifiedGenotyper (both for SNPs and indels). By default is set to 20.", shortName="varScatter", required=false)
  var num_var_scatter_jobs = 20

  @Argument(doc="expand each target in input intervals by the specified number of bases (50 bases by default)", shortName="expand", required=false)
  var expandIntervals = 50

  private var pipeline: Pipeline = _

  private final val picardFixMatesClass = "net.sf.picard.sam.FixMateInformation"

  // Hard-coded resources for the Broad exome run.
  val BAM_FILES : List[File] = (new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list"))).readLines.map(u => new File(u)).toList
  val DBSNP : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
  val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
  val INTS : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list")
  val BASE : String = "exon_vqsr"

  val handFiltered : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf")

  // Arguments shared by every GATK job in this pipeline.
  trait CommandLineGATKArgs extends CommandLineGATK {
    this.intervals :+= INTS
    this.jarFile = qscript.gatkJar
    this.reference_sequence = REF
    this.memoryLimit = Some(4)
  }
  // ------------ SETUP THE PIPELINE ----------- //

  def script = {
    endToEnd(BASE,"cleaned")
  }

  def endToEnd(base: String, bamType: String) = {
    val bamFiles = BAM_FILES

    val ei : ExpandIntervals = new ExpandIntervals(INTS,1,qscript.expandIntervals, new File("Resources", base + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS")
    ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out")

    if (qscript.expandIntervals > 0) {
      //add(ei)
    }

    trait ExpandedIntervals extends CommandLineGATK {
      if (qscript.expandIntervals > 0) {
        this.intervals :+= ei.outList
      }
    }

    // Call indels
    val indels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
    indels.analysisName = base + "_indels"
    indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.indels.out")
    indels.memoryLimit = Some(6)
    indels.downsample_to_coverage = Some(600)
    indels.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL
    indels.input_file = bamFiles
    indels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
    indels.out = new File("IndelCalls", base+".indels.vcf")

    indels.scatterCount = qscript.num_var_scatter_jobs
    indels.setupScatterFunction = {
      case scatter: ScatterFunction =>
        scatter.commandDirectory = new File("IndelCalls/ScatterGather")
        scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
    }
    indels.setupCloneFunction = {
      case (clone: CloneFunction, index: Int) =>
        clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
        clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
    }
    indels.setupGatherFunction = {
      case (gather: GatherFunction, source: ArgumentSource) =>
        gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
        gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
    }

    // Filter indels
    val filteredIndels = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
    filteredIndels.analysisName = base + "_filteredIndels"
    filteredIndels.jobOutputFile = new File(".queue/logs/IndelCalling/VariantFiltration.indels.out")
    filteredIndels.filterName ++= List("IndelQUALFilter","IndelSBFilter","IndelQDFilter")
    filteredIndels.filterExpression ++= List("\"QUAL<30.0\"","\"SB>-1.0\"","\"QD<2\"")
    filteredIndels.variantVCF = indels.out
    filteredIndels.out = swapExt("IndelCalls", indels.out, ".vcf",".filtered.vcf")

    // Call snps
    val snps = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
    snps.analysisName = base+"_snps"
    snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out")
    snps.memoryLimit = Some(6)
    snps.downsample_to_coverage = Some(600)
    snps.input_file = bamFiles
    snps.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
    snps.out = new File("SnpCalls", base+".snps.vcf")

    snps.scatterCount = qscript.num_var_scatter_jobs
    snps.setupScatterFunction = {
      case scatter: ScatterFunction =>
        scatter.commandDirectory = new File("SnpCalls/ScatterGather")
        scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
    }
    snps.setupCloneFunction = {
      case (clone: CloneFunction, index: Int) =>
        clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
        clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
    }
    snps.setupGatherFunction = {
      case (gather: GatherFunction, source: ArgumentSource) =>
        gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
        gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
    }

    // Filter snps at indels
    val filteredSNPs = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
    filteredSNPs.analysisName = base+"_filteredSNPs"
    filteredSNPs.jobOutputFile = new File(".queue/logs/SNPCalling/VariantFiltration.snps.out")
    filteredSNPs.clusterWindowSize = Some(10)
    filteredSNPs.clusterSize = Some(3)
    filteredSNPs.rodBind :+= RodBind("mask", "VCF", filteredIndels.out)
    filteredSNPs.variantVCF = snps.out
    filteredSNPs.out = swapExt("SnpCalls",snps.out,".vcf",".filtered.vcf")

    // Mako de Clusters -- train the contrastive recalibrator
    val cr = new ContrastiveRecalibrator with CommandLineGATKArgs with ExpandedIntervals
    cr.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
    cr.rodBind :+= new RodBind("dbsnp","vcf",DBSNP,"known=true,training=false,truth=false,prior=8.0")
    cr.rodBind :+= new RodBind("hapmap","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"),"known=false,training=true,truth=true,prior=15.0")
    cr.rodBind :+= new RodBind("omni","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"),"known=false,training=true,truth=true,prior=12.0")
    cr.allPoly = true
    cr.use_annotation ++= List("HaplotypeScore","SB","QD","HRun")
    cr.tranches_file = new File(base+".tranche")
    cr.recal_file = new File(base+".contrastive.recal.table")
    // NOTE(review): "98.0" and "97.5" appear twice in this tranche list -- probably a paste error; confirm.
    cr.tranche ++= List("99.9","99.5","99.25","98.0","97.75","97.65","97.5","97.3","97.2","97.1","98.0","97.5","97.0","96.75","96.5","96.0","95.5","95.0","94.75","94.5","94.25","94.0",
      "93.75","93.5","93.25","93.0","92.75","92.5","92.25","92.0","91.0","90.0")
    cr.analysisName = base+"_ContrastiveRecalibrator"
    cr.memoryLimit = Some(32)
    cr.num_threads = Some(6)


    // Apply the Recalibration
    val ar = new ApplyRecalibration with CommandLineGATKArgs with ExpandedIntervals
    ar.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
    ar.tranches_file = cr.tranches_file
    ar.recal_file = cr.recal_file
    ar.ts_filter_level = Some(91.75)
    ar.out = new File(base+"_contrastive_recal.91.75.vcf")
    ar.memoryLimit = Some(6)

    // Variant eval the standard region
    val stdEval = new VariantEval with CommandLineGATKArgs
    stdEval.analysisName = base+"_VariantEval"
    stdEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.std.out")
    stdEval.noST = true
    stdEval.noEV = true
    stdEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
    stdEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
    stdEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
    stdEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
    stdEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
    stdEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
    stdEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
    stdEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
    stdEval.out = swapExt(ar.out, ".vcf", ".eval")
    stdEval.num_threads = Some(6)

    // Variant eval the flanking region
    val flanksEval = new VariantEval with CommandLineGATKArgs
    flanksEval.analysisName = base+"_VariantEval"
    flanksEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.flanks.out")
    flanksEval.intervals = List(ei.outList)
    flanksEval.noST = true
    flanksEval.noEV = true
    flanksEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
    flanksEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
    flanksEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
    flanksEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
    flanksEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
    flanksEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
    flanksEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
    flanksEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
    flanksEval.out = swapExt(ar.out, ".vcf", ".flanks.eval")
    flanksEval.num_threads = Some(6)

    // Make the bam list
    val listOfBams = new File("Resources", base +".BamFiles.list")

    // NOTE(review): writeBamList is constructed but its add() is commented out below.
    val writeBamList = new ListWriterFunction
    writeBamList.analysisName = base + "_BamList"
    writeBamList.jobOutputFile = new File(".queue/logs/Overall/WriteBamList.out")
    writeBamList.inputFiles = bamFiles
    writeBamList.listFile = listOfBams

    //add(indels, filteredIndels, snps, filteredSNPs, stdEval, writeBamList,cr,ar)
    add(ar,stdEval)

    if (qscript.expandIntervals > 0) {
      add(flanksEval)
    }

  }
}

// ==== scala/qscript/oneoffs/chartl/Phase1WholeGenome.scala (new file; head only -- the class body continues on the next source line) ====

import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType

// 1000 Genomes phase 1 whole-genome calling: chunk one chromosome into ~700kb jobs,
// realign per population, then call SNPs/indels per continental analysis panel.
class Phase1WholeGenome extends QScript {
  qscript =>

  @Input(doc="path to GATK jar", shortName="gatk", required=true)
  var gatkJar: File = _

  @Input(doc="the chromosome to process", shortName="chr", required=true)
  var chr: Int = _

  @Input(doc="output path", shortName="outputDir", required=true)
  var outputDir: String = _

  @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=true)
  var outputTmpDir: String = _

  private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
  private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
  private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf"
  // b37 chromosome lengths indexed by (chr - 1); autosomes only, so chr must be 1-22.
  val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566)
  val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT","LWK","MXL","PUR","TSI","YRI") //,"PPN")

  private var pipeline: Pipeline = _

  // Arguments shared by every GATK job in this script.
  trait CommandLineGATKArgs extends CommandLineGATK {
    this.jarFile = qscript.gatkJar
    this.reference_sequence = qscript.reference
    this.memoryLimit = Some(3)
    this.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP )
  }

  // One continental analysis panel: the populations feeding it, plus its SNP/indel caller jobs.
  class AnalysisPanel(val baseName: String, val pops: List[String], val jobNumber: Int) {
    val rawVCFsnps = new File(qscript.outputDir + "/calls/chr" + qscript.chr.toString + "/" + baseName + "/" + baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber + ".raw.snps.vcf")
    val rawVCFindels = new File(qscript.outputDir + "/calls/chr" + qscript.chr.toString + "/" + baseName + "/" + baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber + ".raw.indels.vcf")

    val callSnps = new UnifiedGenotyper with CommandLineGATKArgs
    callSnps.out = rawVCFsnps
    callSnps.dcov = Some( 50 )
    callSnps.stand_call_conf = Some( 4.0 )
    callSnps.stand_emit_conf = Some( 4.0 )
    callSnps.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY)
    callSnps.jobName = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" +baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber + ".raw.snps"
    callSnps.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL)

    val callIndels = new UnifiedGenotyper with CommandLineGATKArgs
    callIndels.out = rawVCFindels
    callIndels.dcov = Some( 50 )
    callIndels.stand_call_conf = Some( 10.0 )
    callIndels.stand_emit_conf = Some( 10.0 )
    callIndels.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF)
    callIndels.glm = Some(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
    callIndels.minIndelCnt = Some(5)
    callIndels.read_filter :+= "Platform454"
    callIndels.jobName = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" +baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber + ".raw.indels"
    callIndels.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL)
    callIndels.abort_at_too_much_coverage = Some(4500)
  }

  // Tile the chromosome into ~700kb chunks and schedule one calling round per chunk.
  def script = {
    val basesPerJob: Int = 700000
    val lastBase: Int = qscript.chromosomeLength(qscript.chr - 1)
    var start: Int = 1
    var stop: Int = start - 1 + basesPerJob
    if( stop > lastBase ) { stop = lastBase }
    var jobNumber: Int = 1
    while( jobNumber < (lastBase.toFloat / basesPerJob.toFloat) + 1.0) {
      callThisChunk("%d:%d-%d".format(qscript.chr, start, stop), jobNumber)
      start += basesPerJob
      stop += basesPerJob
      if( stop > lastBase ) { stop = lastBase }
      jobNumber += 1
    }
  }


  // For one chunk: per-population target creation + indel realignment, then feed the
  // cleaned bams into each analysis panel's SNP and indel callers.
  def callThisChunk(interval: String, jobNumber: Int) = {

    val AFR = new AnalysisPanel("AFR", List("LWK","YRI","ASW","CLM","PUR"), jobNumber)
    val AMR = new AnalysisPanel("AMR", List("MXL","CLM","PUR","ASW"), jobNumber)
    val EUR = new AnalysisPanel("EUR", List("CEU","FIN","GBR","TSI","IBS","MXL","CLM","PUR","ASW"), jobNumber)
    val ASN = new AnalysisPanel("ASN", List("CHB","CHS","JPT","MXL","CLM","PUR"), jobNumber)
    //val PAA = new AnalysisPanel("PAA", List("PPN","YRI","CHB"), jobNumber)
    val analysisPanels = List(AFR, ASN, AMR, EUR)

    for( population <- qscript.populations ) {
      val baseTmpName: String = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" + population + ".phase1.chr" + qscript.chr.toString + "." + jobNumber.toString + "."
      val bamList: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_phase1_release/bam_lists/%s.list".format(population))
      val targetIntervals: File = new File(baseTmpName + "target.intervals")

      // 1.) Create cleaning targets
      val target = new RealignerTargetCreator with CommandLineGATKArgs
      target.memoryLimit = Some(3)
      target.input_file :+= bamList
      target.intervalsString :+= interval
      target.out = targetIntervals
      target.mismatchFraction = Some(0.0)
      target.maxIntervalSize = Some(700)
      target.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls)
      target.jobName = baseTmpName + "target"
      target.isIntermediate = true

      // 2.) Clean without SW
      val clean = new IndelRealigner with CommandLineGATKArgs
      val cleanedBam = new File(baseTmpName + "cleaned.bam")
      clean.memoryLimit = Some(4)
      clean.input_file :+= bamList
      clean.intervalsString :+= interval
      clean.targetIntervals = targetIntervals
      clean.out = cleanedBam
      clean.doNotUseSW = true
      clean.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE)
      clean.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls)
      clean.jobName = baseTmpName + "clean"
      clean.isIntermediate = true
      clean.compress = Some(0)
      clean.index_output_bam_on_the_fly = Some(true)
      clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true

      add(target, clean)

      // Route this population's cleaned bam into every panel that includes it.
      for( a <- analysisPanels ) {
        for( p <- a.pops) {
          if( p == population ) {
            a.callSnps.input_file :+= cleanedBam
            a.callIndels.input_file :+= cleanedBam
          }
        }
      }
    }

    for( a <- analysisPanels ) {
      a.callSnps.intervalsString :+= interval
      a.callIndels.intervalsString :+= interval
      add(a.callSnps, a.callIndels)
    }

  }
}

// ==== scala/qscript/oneoffs/chartl/RFAPipeline.q (new file; the ExtractorArgs trait continues on the next source line) ====

import org.broadinstitute.sting.commandline.ArgumentCollection
import org.broadinstitute.sting.gatk.CommandLineGATK
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection
import org.broadinstitute.sting.queue.extensions.gatk.{RodBind, RFCombine, RFExtractor}
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.util.IOUtils

// Read-feature association pipeline: shard the bam list, run one RFExtractor per shard,
// then combine the per-shard tables.
class RFAPipeline extends QScript {
  rfapipeline =>

  @Argument(doc="bam list",shortName="bams") var bamList: File = _
  @Argument(doc="sting dir",shortName="sting") var stingDir: String = _
  @Argument(doc="reference file",shortName="ref") var ref: File = _
  @Argument(doc="output base",shortName="o") var baseOut: String = _
  @Argument(doc="interval list",shortName="L") var intervals: File = _
  @ArgumentCollection var rfaArgs : RFAArgumentCollection = new RFAArgumentCollection()
  @Argument(doc="Number of bams per text file output",shortName="br",required = false) var br = 20

  def script = {
    // step one, break the bam files up
    var subBams : Iterator[List[File]] = extractFileEntries(bamList).toList.grouped(br)

    trait ExtractorArgs extends RFExtractor {
      this.reference_sequence = rfapipeline.ref
      this.jarFile = new File(rfapipeline.stingDir+"/dist/GenomeAnalysisTK.jar")
      this.intervals :+= rfapipeline.intervals
      // copy the args into the extractor
      this.windowJump = Some(rfaArgs.windowJump)
      this.windowSize = Some(rfaArgs.windowSize)
      this.fixedZ = Some(rfaArgs.fixedZ)
      this.perSampleZ = Some(rfaArgs.sampleZThresh)
      this.HighInsertSize= Some(rfaArgs.highInsertSize)
      this.LowInsertSize = Some(rfaArgs.lowInsertSize)
      this.clippedBases = Some(rfaArgs.clippedBases)
      // (tail of the ExtractorArgs trait begun on the previous source line)
      this.sampleEpsilon = Some(rfaArgs.EPSILON)
      this.memoryLimit = Some(2)
    }

    // One extractor job per bam shard; outputs are numbered <baseOut>.<i>.txt.
    val extract : List[RFExtractor] = subBams.zipWithIndex.map( u => {
      var g = new RFExtractor with ExtractorArgs
      g.input_file ++= u._1
      g.out = new File("%s.%d.txt".format(rfapipeline.baseOut,u._2))
      g
    }).toList

    addAll(extract)

    trait CombineArgs extends RFCombine {
      this.reference_sequence = rfapipeline.ref
      this.jarFile = new File(rfapipeline.stingDir+"/dist/GenomeAnalysisTK.jar")
    }

    // Bind each extractor's table output (as rod "s<i>") into a single combine job.
    var combine : RFCombine = new RFCombine with CombineArgs
    var idx : Int = 0
    extract.foreach( ex => {
      val name = "s%d".format(idx)
      val exRB = new RodBind(name,"table",ex.out)
      combine.rodBind :+= exRB
      combine.memoryLimit = Some(6)
      idx+=1;
    })

    combine.out = new File(baseOut)

    add(combine)
  }
}