Putting new association files, some qscripts, and the new pick sequenom probes file under local version control. I notice some dumb emacs backup files, I'll kill those off momentarily. Also minor changes to GenomeLoc (getStartLoc() and getEndLoc() convenience methods)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6034 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
95614ce3d6
commit
9b90e9385d
|
|
@ -0,0 +1,84 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||||
|
import org.broadinstitute.sting.gatk.executive.TreeReducer;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 6/14/11
|
||||||
|
* Time: 10:14 AM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class LocusDepthProportionWalker extends LocusWalker<double[],Boolean> implements TreeReducible<Boolean> {
|
||||||
|
|
||||||
|
@Output
|
||||||
|
PrintStream out;
|
||||||
|
|
||||||
|
private Map<String,Integer> samOrder;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
samOrder = new HashMap<String,Integer>(getToolkit().getSAMFileSamples().size());
|
||||||
|
int idx = 0;
|
||||||
|
out.printf("pos");
|
||||||
|
for ( Sample s : getToolkit().getSAMFileSamples() ) {
|
||||||
|
out.printf("\t");
|
||||||
|
out.printf(s.getId());
|
||||||
|
samOrder.put(s.getId(),idx++);
|
||||||
|
}
|
||||||
|
out.printf("\t%s%n","total");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean reduceInit() { return null; }
|
||||||
|
|
||||||
|
public double[] map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
|
if ( ref == null || ! context.hasBasePileup() || ! context.hasReads() ) { return null; }
|
||||||
|
|
||||||
|
out.print(ref.getLocus());
|
||||||
|
double[] props = new double[1+samOrder.size()];
|
||||||
|
|
||||||
|
// one pass this
|
||||||
|
int nReads = context.size();
|
||||||
|
|
||||||
|
for ( PileupElement e : context.getBasePileup() ) {
|
||||||
|
props[samOrder.get(e.getRead().getReadGroup().getSample())] += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( int idx = 0; idx < props.length -1 ; idx ++ ) {
|
||||||
|
props[idx] /= nReads;
|
||||||
|
}
|
||||||
|
|
||||||
|
props[props.length-1] = nReads;
|
||||||
|
|
||||||
|
return props;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean reduce(double[] map, Boolean pr) {
|
||||||
|
if ( map == null ) { return null; }
|
||||||
|
|
||||||
|
StringBuffer buf = new StringBuffer();
|
||||||
|
for ( double d : map ) {
|
||||||
|
buf.append("\t");
|
||||||
|
buf.append(String.format("%.4f",d));
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("%s%n",buf.toString());
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean treeReduce(Boolean a, Boolean b) { return null; }
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Argument collection for the read feature associator and related walkers
|
||||||
|
*/
|
||||||
|
public class RFAArgumentCollection {
|
||||||
|
|
||||||
|
@Argument(doc="ReadFeatures you want to test. None specified = all will be tested.",required=false,shortName="f",fullName="Feature")
|
||||||
|
public List<String> inputFeatures = null;
|
||||||
|
|
||||||
|
@Argument(doc="Size of window on which to perform tests",required=false,fullName="windowSize")
|
||||||
|
public int windowSize = 50;
|
||||||
|
|
||||||
|
@Argument(doc="Size of the jump between tested windows",required=false,fullName="windowJump")
|
||||||
|
public int windowJump = 10;
|
||||||
|
|
||||||
|
@Argument(doc="File containing a list of case samples",required=false,shortName="case",fullName="case")
|
||||||
|
public File caseFile = null;
|
||||||
|
|
||||||
|
@Argument(doc="File containing a list of control samples",required=false,shortName="control",fullName="control")
|
||||||
|
public File controlFile = null;
|
||||||
|
|
||||||
|
@Argument(doc="Fixed significance level, as a Z-score",required=false,shortName="z",fullName="fixedZ")
|
||||||
|
public double fixedZ = 6.0;
|
||||||
|
|
||||||
|
@Argument(doc="Insert size below which to flag a read as aberrant",required=false,shortName="LIS",fullName="LowInsertSize")
|
||||||
|
public int lowInsertSize = 50;
|
||||||
|
|
||||||
|
@Argument(doc="Insert size above which to flag a read as aberrant",required=false,shortName="HIS",fullName="HighInsertSize")
|
||||||
|
public int highInsertSize = 450;
|
||||||
|
|
||||||
|
@Argument(doc="Significance level for determining whether a sample contains a significant proportion of affected reads",required=false,shortName="sz",fullName="perSampleZ")
|
||||||
|
public double sampleZThresh = 3.0;
|
||||||
|
|
||||||
|
@Argument(doc="Lower bound for significant proportion of affected reads",shortName="se",fullName="sampleEpsilon",required=false)
|
||||||
|
public double EPSILON = 0.05;
|
||||||
|
|
||||||
|
@Argument(doc="Number of clipped bases for a read to be considered \"split\"",required=false,shortName="cb",fullName="clippedBases")
|
||||||
|
public short clippedBases = 4;
|
||||||
|
|
||||||
|
/* todo -- these for a possible "split read" binarification of clipped bases -- if there's a fast way to traverse clipped bases
|
||||||
|
@Argument(doc="Minimum base quality for a clipped base to be indicative of a split read",required=false,shortName="CLBQ",fullName="CLBQ")
|
||||||
|
public short clbq = 10;
|
||||||
|
|
||||||
|
@Argument(doc="Minimum base quality sum for clipped bases (as a string) to be indicative of a split read",required=false,shortName="CLBS",fullName="CLBS")
|
||||||
|
public short clbs=50;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Argument collection for the read feature associator and related walkers
|
||||||
|
*/
|
||||||
|
public class RFAArgumentCollection {
|
||||||
|
|
||||||
|
@Argument(doc="ReadFeatures you want to test. None specified = all will be tested.",required=false,shortName="f",fullName="Feature")
|
||||||
|
public List<String> inputFeatures = null;
|
||||||
|
|
||||||
|
@Argument(doc="Size of window on which to perform tests",required=false,fullName="windowSize")
|
||||||
|
public int windowSize = 50;
|
||||||
|
|
||||||
|
@Argument(doc="Size of the jump between tested windows",required=false,fullName="windowJump")
|
||||||
|
public int windowJump = 10;
|
||||||
|
|
||||||
|
@Argument(doc="File containing a list of case samples",required=false,shortName="case",fullName="case")
|
||||||
|
public File caseFile = null;
|
||||||
|
|
||||||
|
@Argument(doc="File containing a list of control samples",required=false,shortName="control",fullName="control")
|
||||||
|
public File controlFile = null;
|
||||||
|
|
||||||
|
@Argument(doc="Fixed significance level, as a Z-score",required=false,shortName="z",fullName="fixedZ")
|
||||||
|
public double fixedZ = 6.0;
|
||||||
|
|
||||||
|
@Argument(doc="Insert size below which to flag a read as aberrant",required=false,shortName="LIS",fullName="LowInsertSize")
|
||||||
|
public int lowInsertSize = 50;
|
||||||
|
|
||||||
|
@Argument(doc="Insert size above which to flag a read as aberrant",required=false,shortName="HIS",fullName="HighInsertSize")
|
||||||
|
public int highInsertSize = 450;
|
||||||
|
|
||||||
|
@Argument(doc="Significance level for determining whether a sample contains a significant proportion of affected reads",required=false,shortName="sz",fullName="perSampleZ")
|
||||||
|
public double sampleZThresh = 3.0;
|
||||||
|
|
||||||
|
@Argument(doc="Lower bound for significant proportion of affected reads",shortName="se",fullName="sampleEpsilon",required=false)
|
||||||
|
public double EPSILON = 0.05;
|
||||||
|
|
||||||
|
@Argument(doc="Number of clipped bases for a read to be considered \"split\"",required=false,shortName="cb",fullName="clippedBases")
|
||||||
|
public short clippedBases = 5;
|
||||||
|
|
||||||
|
/* todo -- these for a possible "split read" binarification of clipped bases -- if there's a fast way to traverse clipped bases
|
||||||
|
@Argument(doc="Minimum base quality for a clipped base to be indicative of a split read",required=false,shortName="CLBQ",fullName="CLBQ")
|
||||||
|
public short clbq = 10;
|
||||||
|
|
||||||
|
@Argument(doc="Minimum base quality sum for clipped bases (as a string) to be indicative of a split read",required=false,shortName="CLBS",fullName="CLBS")
|
||||||
|
public short clbs=50;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,335 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||||
|
import org.broadinstitute.sting.gatk.filters.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
 * Read feature association walker -- associates read features between dichotomized, or multi-group cohorts.
 *
 * Streams reads through a sliding-window accumulator (RFWindow); whenever a
 * window completes it runs a fixed-significance test per feature and writes a
 * bedTable row of results.
 *
 * todo -- need a heuristic to stop doing tests where there is certainly no signal
 * todo -- for most features there's a nuisance variable which is the proportion of *paired* reads, perhaps a pair-only setting for read features
 */
@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class,NotPrimaryAlignmentReadFilter.class,UnmappedReadFilter.class})
public class RFAWalker extends ReadWalker<SAMRecord,RFWindow> {
    // todo -- this needs to be an argument collection that can get passed around to initialize read features etc
    @ArgumentCollection
    private RFAArgumentCollection collection = new RFAArgumentCollection();

    /** Destination for the bedTable of per-window test results. */
    @Output
    public PrintStream out;

    // sample ID -> true for case, false for control; populated in initialize()
    Map<String,Boolean> caseStatus;

    protected List<ReadFeatureAggregator> aggregators; // no re-instantiation, use a list to ensure ordering

    // Traversal state shared between map() and reduce():
    // iterator over the user-supplied -L intervals, the current interval,
    // the current read locus, and the current read's sample.
    protected Iterator<GenomeLoc> locusIterator;
    protected GenomeLoc iteratorLoc;
    protected GenomeLoc loc;
    protected String sample;
    // NOTE(review): shared mutable list handed out by getAffectedSamples();
    // callers must never modify the returned lists when fixedDelta == 0.
    private List<String> EMPTY_LIST = new ArrayList<String>(0);

    /**
     * Validates arguments, loads the case/control cohort files, instantiates
     * one aggregator per selected read-feature class (sorted by simple class
     * name for a stable column order), writes the output header, and primes
     * the interval iterator.
     */
    public void initialize() {
        if ( collection.windowSize % collection.windowJump != 0 ) {
            throw new UserException("Window size is not divisible by window jump.");
        }

        if ( collection.caseFile == null || collection.controlFile == null ) {
            throw new UserException("You must provide both a case file (-case) and a control file (-control) each listing those samples belonging to the cohort");
        }

        caseStatus = new HashMap<String,Boolean>(getToolkit().getSAMFileSamples().size());
        try {
            // One sample ID per line; cases first, controls second. A sample
            // listed in both files ends up marked control (last write wins).
            for ( String sample : new XReadLines(collection.caseFile) ) {
                caseStatus.put(sample,true);
            }
            for ( String sample : new XReadLines(collection.controlFile)) {
                caseStatus.put(sample,false);
            }

            // Every sample in the SAM header must be assigned a cohort.
            for ( Sample sample : getToolkit().getSAMFileSamples() ) {
                if ( ! caseStatus.containsKey(sample.getId())) {
                    throw new UserException("No case/control status for sample "+sample.getId());
                }
            }

        } catch ( FileNotFoundException e ) {
            throw new UserException("Unable to open a case/control file",e);
        }

        // Reflectively construct one aggregator per feature class, each taking
        // the argument collection as its sole constructor argument.
        Set<Class<? extends ReadFeatureAggregator>> aggregatorSet = getFeatureAggregators(collection.inputFeatures);
        Set<ReadFeatureAggregator> rfHolder1 = new HashSet<ReadFeatureAggregator>(aggregatorSet.size());
        try {
            for ( Class<? extends ReadFeatureAggregator> featureClass : aggregatorSet ) {
                ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(collection);
                rfHolder1.add(readFeature);
            }
        } catch ( Exception e ) {
            throw new StingException("A read feature instantiation error occurred during initialization",e);
        }

        // Copy into an array and sort by simple class name so aggregator order
        // (and therefore output column order) is deterministic.
        ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()];
        int idx = 0;
        for ( ReadFeatureAggregator f : rfHolder1 ) {
            rfHolder2[idx++] = f;
        }
        Arrays.sort(rfHolder2, new Comparator<ReadFeatureAggregator>() {
            @Override
            public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) {
                return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName());
            }
        });
        aggregators = Arrays.asList(rfHolder2);

        writeHeader();

        // Prime the interval cursor; null when no intervals were supplied.
        locusIterator = getToolkit().getIntervals().iterator();
        iteratorLoc = locusIterator.hasNext() ? locusIterator.next() : null;

    }

    /**
     * Creates the per-traversal window accumulator.
     * NOTE(review): the local 'samples' set is built and then discarded --
     * dead code left for a future constructor signature, presumably.
     */
    public RFWindow reduceInit() {
        Set<String> samples = new HashSet<String>(getToolkit().getSamples().size());
        for ( Sample s : getToolkit().getSamples() ) {
            samples.add(s.getId());
        }
        return new RFWindow(aggregators,collection,caseStatus,getToolkit().getGenomeLocParser());
    }

    /**
     * Records the current read's locus and sample in walker state and passes
     * the read through to reduce().
     *
     * @return the read unchanged, or null to skip (no reference context or no read)
     */
    public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
        if ( ref == null ) { return null; } // unmapped reads have null ref contexts
        //loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart());
        GenomeLoc newLoc = ref.getLocus().getStartLocation(); // can be problematic if read aligns prior to start of contig -- should never happen
        // Clamp the tracked locus to the start of the current interval.
        // NOTE(review): iteratorLoc is dereferenced without a null check; this
        // NPEs when the walker is run without -L intervals -- confirm intervals
        // are mandatory for this tool.
        if ( newLoc.isPast(iteratorLoc.getStartLocation()) ) {
            loc = newLoc;
        } else {
            loc = iteratorLoc.getStartLocation();
        }
        if ( read == null ) { return null; }
        sample = read.getReadGroup().getSample();
        return read;
    }

    /**
     * Feeds the read into the window accumulator. When the tracked locus has
     * moved past the current interval, flushes that interval
     * (onIntervalDone), advances the interval cursor, and recurses to process
     * the read against the new interval. Completed windows are tested and
     * printed via runWindowTests().
     */
    public RFWindow reduce(SAMRecord read, RFWindow prevReduce) {
        if ( iteratorLoc != null && iteratorLoc.isBefore(loc) ) {// test if read is past end of the user interval
            //logger.info(String.format("iteratorLoc: %s loc: %s",iteratorLoc.toString(),loc.toString()));
            onIntervalDone(prevReduce);
            iteratorLoc = locusIterator.hasNext() ? locusIterator.next() : null;
            // NOTE(review): if the interval list is exhausted, iteratorLoc is
            // null here and startsBefore(null) may NPE -- confirm GenomeLoc
            // tolerates a null argument.
            if ( loc.startsBefore(iteratorLoc) ) {
                loc = iteratorLoc.getStartLocation();
            }
            // Re-enter with the same read so it is accumulated into the new interval.
            reduce(read,prevReduce);
        } else if ( read != null ) {
            // todo -- what happens if first read of an interval is not before or at the start of the interval?
            List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> completed = prevReduce.inc(read, loc, sample,iteratorLoc);
            // todo -- run tests here; for now just log that a window/multiple windows are complete
            if ( completed.size() > 0 ) {
                // System.out.printf("At %s we have seen %d completed windows%n",loc,completed.size())
                // bed format
                // NOTE(review): locShift is accumulated but never read.
                int locShift = 0;
                for ( Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>> samWindow : completed ) {
                    GenomeLoc window = samWindow.first;
                    runWindowTests(samWindow.second, window);
                    locShift += collection.windowJump;
                }
            }
        }
        return prevReduce;
    }


    /**
     * Flushes all windows still open in the accumulator for the interval that
     * just ended, running the tests on each completed window.
     */
    public RFWindow onIntervalDone(RFWindow rWindow) {
        //logger.info("In onIntervalDone at genome loc "+iteratorLoc.toString()+" with read loc "+loc.toString());
        List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> completed = rWindow.flush(iteratorLoc);
        // NOTE(review): locShift is accumulated but never read.
        int locShift = 0;
        for ( Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>> samWindow : completed ) {
            GenomeLoc window = samWindow.first;
            runWindowTests(samWindow.second, window);
            locShift += collection.windowJump;
        }

        return rWindow;
    }

    /**
     * Resolves the requested feature names (simple class names) to aggregator
     * classes discovered via the plugin manager.
     *
     * @param requestedFeatures names to resolve; when collection.inputFeatures
     *        is null, every discovered aggregator class is returned instead
     * @throws UserException if a requested name matches no available class
     */
    public Set<Class<? extends ReadFeatureAggregator>> getFeatureAggregators(List<String> requestedFeatures) {
        HashSet<Class<? extends ReadFeatureAggregator>> newFeatureSet = new HashSet<Class<? extends ReadFeatureAggregator>>();
        List<Class<? extends ReadFeatureAggregator>> availableFeatures = new PluginManager<ReadFeatureAggregator>(ReadFeatureAggregator.class).getPlugins();

        if ( collection.inputFeatures == null ) {
            newFeatureSet.addAll(availableFeatures);
            return newFeatureSet;
        }


        Map<String,Class<? extends ReadFeatureAggregator>> classNameToClass = new HashMap<String,Class<? extends ReadFeatureAggregator>>(collection.inputFeatures.size());
        for ( Class<? extends ReadFeatureAggregator> clazz : availableFeatures ) {
            classNameToClass.put(clazz.getSimpleName(),clazz);
        }

        for ( String s : requestedFeatures) {
            if ( classNameToClass.containsKey(s) ) {
                newFeatureSet.add(classNameToClass.get(s));
            } else {
                throw new UserException("The name "+s+" does not correspond to an available read feature class.");
            }
        }

        return newFeatureSet;
    }

    /**
     * Runs the per-feature tests on one completed window and writes a single
     * bedTable row: contig, start, stop, then per feature the fixed delta,
     * case:control affected counts, a binomial p-value on the split, and the
     * affected sample lists.
     */
    public void runWindowTests(Map<String,List<ReadFeatureAggregator>> window, GenomeLoc loc) {
        // two main tests: fixed-significance shift, and confidence-interval sum
        //System.out.printf("Running tests...%n");
        // todo -- really the aggregators should be iterated over directly (rather than indirectly through the index)
        out.printf("%s\t%d\t%d",loc.getContig(),loc.getStart(),loc.getStop());
        for ( int agIdx = 0; agIdx < aggregators.size(); agIdx ++ ) {
            // Pooled "case"/"control" aggregators live under those reserved map keys.
            double fixedDelta = fixedSignificance(window.get("case").get(agIdx),window.get("control").get(agIdx));
            //double weightedDelta = confidenceIntervalSum(window,agIdx);
            //out.printf("\t%.2e\t%.2e",Double.isNaN(fixedDelta) ? 0.0 : fixedDelta,weightedDelta);
            Pair<List<String>,List<String>> caseControlAffected = getAffectedSamples(window,agIdx,fixedDelta);
            List<String> cases = caseControlAffected.getFirst();
            List<String> controls = caseControlAffected.getSecond();
            out.printf("\t%.2e\t%d:%d\t%.2e\t%s;%s",fixedDelta,cases.size(),controls.size(), MathUtils.binomialProbability(cases.size(),cases.size()+controls.size(),0.5),Utils.join(",",cases),Utils.join(",",controls));

        }
        out.printf("%n");
    }

    /**
     * Finds, per cohort, the samples whose per-sample mean for aggregator
     * 'idx' is significantly above collection.EPSILON (one-sample Z test
     * against sampleZThresh).
     *
     * @return pair of (affected cases, affected controls); both empty when
     *         fixedDelta is 0 (the pooled test found nothing)
     */
    public Pair<List<String>,List<String>> getAffectedSamples(Map<String,List<ReadFeatureAggregator>> aggregators, int idx, double fixedDelta) {
        if ( fixedDelta == 0.0 ) { return new Pair<List<String>,List<String>>(EMPTY_LIST,EMPTY_LIST); } // todo -- too hacky

        Pair<List<String>,List<String>> ccSampleList = new Pair<List<String>,List<String>>(new ArrayList<String>(), new ArrayList<String>());
        for ( Map.Entry<String,List<ReadFeatureAggregator>> entry : aggregators.entrySet() ) {
            // Skip the pooled cohort entries; only per-sample entries are scored.
            if ( entry.getKey().equals("case") || entry.getKey().equals("control")) { continue; }
            ReadFeatureAggregator aggregator = entry.getValue().get(idx);
            // is this sending a truly significant signal
            double zs = (aggregator.getMean() - collection.EPSILON) * Math.sqrt(aggregator.getnReads())/Math.sqrt(aggregator.getUnbiasedVar());
            if ( zs > collection.sampleZThresh ) {
                if ( caseStatus.get(entry.getKey()) ) {
                    ccSampleList.first.add(entry.getKey());
                } else {
                    ccSampleList.second.add(entry.getKey());
                }
            }
        }

        return ccSampleList;
    }

    /**
     * Two-sample Z test between the pooled case and control aggregators.
     *
     * @return the raw mean difference when |Z| >= collection.fixedZ, else 0.0;
     *         also 0.0 when either cohort has no reads
     */
    public double fixedSignificance(ReadFeatureAggregator caseAg, ReadFeatureAggregator controlAg) {
        if ( caseAg.getnReads() == 0 || controlAg.getnReads() == 0 ) {
            return 0.0;
        }
        double stat_num = caseAg.getMean() - controlAg.getMean();
        double stat_denom = Math.sqrt(caseAg.getUnbiasedVar()/caseAg.getnReads() + controlAg.getUnbiasedVar()/controlAg.getnReads());
        double stat = stat_num/stat_denom;
        //System.out.printf("Mean_dif: %.2e Var: %.2e Stat: %.2f Z: %.2f SS-ZZ: %.2f%n",stat_num,stat_denom,stat, collection.fixedZ, stat*stat-collection.fixedZ*collection.fixedZ);
        // Compare squared values: sign-free test of |stat| < fixedZ.
        if ( stat*stat < collection.fixedZ*collection.fixedZ ) {
            return 0.0;
        } else {
            //System.out.printf("Calculating delta: %.2f%n",(stat < 0) ? stat_denom*(-1*collection.fixedZ-stat) : stat_denom*(stat-collection.fixedZ));
            //return (stat > 0) ? stat_denom*(stat-collection.fixedZ) : stat_denom*(stat+collection.fixedZ);
            return stat_num;
        }
    }

    /**
     * Alternative (currently unused -- see commented-out call site in
     * runWindowTests) statistic: an inverse-variance-weighted comparison of
     * the per-sample means that shifted significantly away from zero.
     *
     * @return gap between the case and control weighted-mean confidence bounds
     *         (fixedZ/4 standard deviations each side)
     */
    public double confidenceIntervalSum(Map<String,List<ReadFeatureAggregator>> window, int offset) {
        // this comment serves as an explicit normality assumption (e.g. that the DF will be large)
        double caseWeightMean = 0.0;
        double caseWeightVar = 0.0;
        double caseSumWeight = 0.0;
        double controlWeightMean = 0.0;
        double controlWeightVar = 0.0;
        double controlSumWeight = 0.0;
        for ( Map.Entry<String,List<ReadFeatureAggregator>> sampleEntry : window.entrySet() ) {
            if ( ! sampleEntry.getKey().equals("case") && ! sampleEntry.getKey().equals("control") ) {
                // first check if the sample is shifted from zero (CI does not include zero)
                // todo -- fixme. This will always be true for insert sizes, clipped reads, mapping quality...should have an avg value

                ReadFeatureAggregator aggregator = sampleEntry.getValue().get(offset);
                if ( aggregator.getnReads() == 0 ) {
                    continue;
                }

                boolean shifted;
                int shiftThresh = 0;/*
                // todo -- this per-class hard-coding is awful
                if ( aggregator instanceof InsertSize ) {
                    shiftThresh = 150;
                }
                if ( aggregator instanceof ClippedBases ) {
                    shiftThresh = 6;
                }*/
                if ( aggregator.getMean() < shiftThresh ) {
                    // use fixedZ/4 to be a little bit less strenuous -- todo -- make an input?
                    shifted = aggregator.getMean() + collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() < shiftThresh;
                } else {
                    shifted = aggregator.getMean() - collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() > shiftThresh;;
                }
                if ( shifted ) {
                    // Inverse-variance weight: (n-1)/(2*s^2 * s^2) per sample.
                    double twoS2 = 2*aggregator.getUnbiasedVar()*aggregator.getUnbiasedVar();
                    if ( caseStatus.get(sampleEntry.getKey())) {
                        caseWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2);
                        caseWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2);
                        caseSumWeight += (aggregator.getnReads()-1.0)/(twoS2);
                    } else {
                        controlWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2);
                        controlWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2);
                        controlSumWeight += (aggregator.getnReads()-1.0)/(twoS2);
                    }
                }
            }
        }

        // NOTE(review): when no sample in a cohort is "shifted" the sum
        // weights stay 0.0 and these divisions yield NaN -- confirm callers
        // tolerate NaN (the only call site is currently commented out).
        double caseGaussianMean = caseWeightMean/caseSumWeight;
        double controlGaussianMean = controlWeightMean/controlSumWeight;
        double caseGaussianVar = caseWeightVar/(caseSumWeight*caseSumWeight);
        double controlGaussianVar = controlWeightVar/(controlSumWeight*controlSumWeight);
        // todo -- is the z-factor an appropriate statistic?
        //return 1.0 - 3*(caseGaussianVar+controlGaussianVar)/Math.abs(caseGaussianMean-controlGaussianMean);
        if ( caseGaussianMean > controlGaussianMean ) {
            // want to examine the case lower fixedZ*stdev vs the control upper fixedZ*stdev
            return (caseGaussianMean-collection.fixedZ/4*Math.sqrt(caseGaussianVar)) - (controlGaussianMean + collection.fixedZ/4*Math.sqrt(controlGaussianVar));
        } else {
            // want to examine the case upper fixedZ*stdev vs the control lower fixedZ*stdev
            return (controlGaussianMean-collection.fixedZ/4*Math.sqrt(controlGaussianVar)) - ( caseGaussianMean + collection.fixedZ/4*Math.sqrt(caseGaussianVar));
        }
    }

    /**
     * Writes the bedTable track header: a "description=" listing of the four
     * output columns (-d delta, -r ratio, -s significance, -p samples) emitted
     * per aggregator by runWindowTests().
     */
    public void writeHeader() {
        // "%.2e\t%d:%d\t%s,%s\t%.2e"
        StringBuffer buf = new StringBuffer();
        buf.append("description=chr,start,stop");
        for ( ReadFeatureAggregator f : aggregators ) {
            buf.append(",");
            buf.append(f.getClass().getSimpleName());
            buf.append("-d,");
            buf.append(f.getClass().getSimpleName());
            buf.append("-r,");
            buf.append(f.getClass().getSimpleName());
            buf.append("-s,");
            buf.append(f.getClass().getSimpleName());
            buf.append("-p");
        }
        out.printf("track type=bedTable %s%n",buf);
    }
}
|
||||||
|
|
@ -0,0 +1,264 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||||
|
import org.broadinstitute.sting.gatk.filters.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read feature association walker -- associates read features between dichotomized, or multi-group cohorts
|
||||||
|
* todo -- need a heuristic to stop doing tests where there is certainly no signal
|
||||||
|
* todo -- for most features there's a nuisance variable which is the proportion of *paired* reads, perhaps a pair-only setting for read features
|
||||||
|
*/
|
||||||
|
|
||||||
|
@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class})
|
||||||
|
public class RFAWalker extends ReadWalker<SAMRecord,ReadFeatureWindow> {
|
||||||
|
// todo -- this needs to be an argument collection that can get passed around to initialize read features etc
|
||||||
|
@ArgumentCollection
|
||||||
|
private RFAArgumentCollection collection = new RFAArgumentCollection();
|
||||||
|
|
||||||
|
@Output
|
||||||
|
public PrintStream out;
|
||||||
|
|
||||||
|
Map<String,Boolean> caseStatus;
|
||||||
|
|
||||||
|
protected List<ReadFeatureAggregator> aggregators; // no re-instantiation, use a list to ensure ordering
|
||||||
|
|
||||||
|
protected GenomeLoc loc;
|
||||||
|
protected String sample;
|
||||||
|
|
||||||
|
/**
 * Validates arguments, loads the case/control cohort files, instantiates one
 * aggregator per selected read-feature class (sorted by simple class name for
 * a stable column order), and writes the output header.
 */
public void initialize() {
    if ( collection.windowSize % collection.windowJump != 0 ) {
        throw new UserException("Window size is not divisible by window jump.");
    }

    if ( collection.caseFile == null || collection.controlFile == null ) {
        throw new UserException("You must provide both a case file (-case) and a control file (-control) each listing those samples belonging to the cohort");
    }

    caseStatus = new HashMap<String,Boolean>(getToolkit().getSAMFileSamples().size());
    try {
        // One sample ID per line; cases first, controls second. A sample
        // listed in both files ends up marked control (last write wins).
        for ( String sample : new XReadLines(collection.caseFile) ) {
            caseStatus.put(sample,true);
        }
        for ( String sample : new XReadLines(collection.controlFile)) {
            caseStatus.put(sample,false);
        }

        // Every sample in the SAM header must be assigned a cohort.
        for ( Sample sample : getToolkit().getSAMFileSamples() ) {
            if ( ! caseStatus.containsKey(sample.getId())) {
                throw new UserException("No case/control status for sample "+sample.getId());
            }
        }

    } catch ( FileNotFoundException e ) {
        throw new UserException("Unable to open a case/control file",e);
    }

    // Reflectively construct one aggregator per feature class; each class is
    // expected to expose a constructor taking the argument collection.
    Set<Class<? extends ReadFeatureAggregator>> aggregatorSet = getFeatureAggregators(collection.inputFeatures);
    Set<ReadFeatureAggregator> rfHolder1 = new HashSet<ReadFeatureAggregator>(aggregatorSet.size());
    try {
        for ( Class<? extends ReadFeatureAggregator> featureClass : aggregatorSet ) {
            ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(collection);
            rfHolder1.add(readFeature);
        }
    } catch ( Exception e ) {
        throw new StingException("A read feature instantiation error occurred during initialization",e);
    }

    // Copy into an array and sort by simple class name so aggregator order
    // (and therefore output column order) is deterministic.
    ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()];
    int idx = 0;
    for ( ReadFeatureAggregator f : rfHolder1 ) {
        rfHolder2[idx++] = f;
    }
    Arrays.sort(rfHolder2, new Comparator<ReadFeatureAggregator>() {
        @Override
        public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) {
            return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName());
        }
    });
    aggregators = Arrays.asList(rfHolder2);

    writeHeader();

}
|
||||||
|
|
||||||
|
public ReadFeatureWindow reduceInit() {
|
||||||
|
Set<String> samples = new HashSet<String>(getToolkit().getSamples().size());
|
||||||
|
for ( Sample s : getToolkit().getSamples() ) {
|
||||||
|
samples.add(s.getId());
|
||||||
|
}
|
||||||
|
return new ReadFeatureWindow(aggregators,caseStatus,collection);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
if ( ref == null ) { return null; } // unmapped reads have null ref contexts
|
||||||
|
loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart());
|
||||||
|
sample = read.getReadGroup().getSample();
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReadFeatureWindow reduce(SAMRecord read, ReadFeatureWindow prevReduce) {
|
||||||
|
if ( read != null ) {
|
||||||
|
List<Map<String,List<ReadFeatureAggregator>>> completed = prevReduce.inc(read,loc,sample);
|
||||||
|
// todo -- run tests here; for now just log that a window/multiple windows are complete
|
||||||
|
if ( completed.size() > 0 ) {
|
||||||
|
// System.out.printf("At %s we have seen %d completed windows%n",loc,completed.size())
|
||||||
|
// bed format
|
||||||
|
int locShift = 0;
|
||||||
|
for ( Map<String,List<ReadFeatureAggregator>> samWindow : completed ) {
|
||||||
|
GenomeLoc window = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(),loc.getStart()-52+locShift,loc.getStop()-1);
|
||||||
|
runWindowTests(samWindow, window);
|
||||||
|
locShift += collection.windowJump;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return prevReduce;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<Class<? extends ReadFeatureAggregator>> getFeatureAggregators(List<String> requestedFeatures) {
|
||||||
|
HashSet<Class<? extends ReadFeatureAggregator>> newFeatureSet = new HashSet<Class<? extends ReadFeatureAggregator>>();
|
||||||
|
List<Class<? extends ReadFeatureAggregator>> availableFeatures = new PluginManager<ReadFeatureAggregator>(ReadFeatureAggregator.class).getPlugins();
|
||||||
|
|
||||||
|
if ( collection.inputFeatures == null ) {
|
||||||
|
newFeatureSet.addAll(availableFeatures);
|
||||||
|
return newFeatureSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Map<String,Class<? extends ReadFeatureAggregator>> classNameToClass = new HashMap<String,Class<? extends ReadFeatureAggregator>>(collection.inputFeatures.size());
|
||||||
|
for ( Class<? extends ReadFeatureAggregator> clazz : availableFeatures ) {
|
||||||
|
classNameToClass.put(clazz.getSimpleName(),clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( String s : requestedFeatures) {
|
||||||
|
if ( classNameToClass.containsKey(s) ) {
|
||||||
|
newFeatureSet.add(classNameToClass.get(s));
|
||||||
|
} else {
|
||||||
|
throw new UserException("The name "+s+" does not correspond to an available read feature class.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newFeatureSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void runWindowTests(Map<String,List<ReadFeatureAggregator>> window, GenomeLoc loc) {
|
||||||
|
// two main tests: fixed-significance shift, and confidence-interval sum
|
||||||
|
//System.out.printf("Running tests...%n");
|
||||||
|
out.printf("%s\t%d\t%d",loc.getContig(),loc.getStart(),loc.getStop());
|
||||||
|
for ( int agIdx = 0; agIdx < aggregators.size(); agIdx ++ ) {
|
||||||
|
double fixedDelta = fixedSignificance(window.get("case").get(agIdx),window.get("control").get(agIdx));
|
||||||
|
double weightedDelta = confidenceIntervalSum(window,agIdx);
|
||||||
|
out.printf("\t%.2e\t%.2e",Double.isNaN(fixedDelta) ? 0.0 : fixedDelta,weightedDelta);
|
||||||
|
}
|
||||||
|
out.printf("%n");
|
||||||
|
}
|
||||||
|
|
||||||
|
public double fixedSignificance(ReadFeatureAggregator caseAg, ReadFeatureAggregator controlAg) {
|
||||||
|
if ( caseAg.getnReads() == 0 || controlAg.getnReads() == 0 ) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
double stat_num = caseAg.getMean() - controlAg.getMean();
|
||||||
|
double stat_denom = Math.sqrt(caseAg.getUnbiasedVar()/caseAg.getnReads() + controlAg.getUnbiasedVar()/controlAg.getnReads());
|
||||||
|
double stat = stat_num/stat_denom;
|
||||||
|
//System.out.printf("Mean_dif: %.2e Var: %.2e Stat: %.2f Z: %.2f SS-ZZ: %.2f%n",stat_num,stat_denom,stat, collection.fixedZ, stat*stat-collection.fixedZ*collection.fixedZ);
|
||||||
|
if ( stat*stat < collection.fixedZ*collection.fixedZ ) {
|
||||||
|
return 0.0;
|
||||||
|
} else {
|
||||||
|
//System.out.printf("Calculating delta: %.2f%n",(stat < 0) ? stat_denom*(-1*collection.fixedZ-stat) : stat_denom*(stat-collection.fixedZ));
|
||||||
|
//return (stat > 0) ? stat_denom*(stat-collection.fixedZ) : stat_denom*(stat+collection.fixedZ);
|
||||||
|
return stat;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public double confidenceIntervalSum(Map<String,List<ReadFeatureAggregator>> window, int offset) {
|
||||||
|
// this comment serves as an explicit normality assumption (e.g. that the DF will be large)
|
||||||
|
double caseWeightMean = 0.0;
|
||||||
|
double caseWeightVar = 0.0;
|
||||||
|
double caseSumWeight = 0.0;
|
||||||
|
double controlWeightMean = 0.0;
|
||||||
|
double controlWeightVar = 0.0;
|
||||||
|
double controlSumWeight = 0.0;
|
||||||
|
for ( Map.Entry<String,List<ReadFeatureAggregator>> sampleEntry : window.entrySet() ) {
|
||||||
|
if ( ! sampleEntry.getKey().equals("case") && ! sampleEntry.getKey().equals("control") ) {
|
||||||
|
// first check if the sample is shifted from zero (CI does not include zero)
|
||||||
|
// todo -- fixme. This will always be true for insert sizes, clipped reads, mapping quality...should have an avg value
|
||||||
|
|
||||||
|
ReadFeatureAggregator aggregator = sampleEntry.getValue().get(offset);
|
||||||
|
if ( aggregator.getnReads() == 0 ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean shifted;
|
||||||
|
int shiftThresh = 0;
|
||||||
|
// todo -- this is fucking awful
|
||||||
|
if ( aggregator instanceof InsertSize ) {
|
||||||
|
shiftThresh = 150;
|
||||||
|
}
|
||||||
|
if ( aggregator instanceof ClippedBases ) {
|
||||||
|
shiftThresh = 6;
|
||||||
|
}
|
||||||
|
if ( aggregator.getMean() < shiftThresh ) {
|
||||||
|
// use fixedZ/4 to be a little bit less strenuous -- todo -- make an input?
|
||||||
|
shifted = aggregator.getMean() + collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() < shiftThresh;
|
||||||
|
} else {
|
||||||
|
shifted = aggregator.getMean() - collection.fixedZ/4*Math.sqrt(aggregator.getUnbiasedVar())/aggregator.getnReads() > shiftThresh;;
|
||||||
|
}
|
||||||
|
if ( shifted ) {
|
||||||
|
double twoS2 = 2*aggregator.getUnbiasedVar()*aggregator.getUnbiasedVar();
|
||||||
|
if ( caseStatus.get(sampleEntry.getKey())) {
|
||||||
|
caseWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2);
|
||||||
|
caseWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2);
|
||||||
|
caseSumWeight += (aggregator.getnReads()-1.0)/(twoS2);
|
||||||
|
} else {
|
||||||
|
controlWeightMean += (aggregator.getnReads()-1.0)*aggregator.getMean()/(twoS2);
|
||||||
|
controlWeightVar += aggregator.getUnbiasedVar()*Math.pow((aggregator.getnReads()-1.0)/(twoS2),2);
|
||||||
|
controlSumWeight += (aggregator.getnReads()-1.0)/(twoS2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double caseGaussianMean = caseWeightMean/caseSumWeight;
|
||||||
|
double controlGaussianMean = controlWeightMean/controlSumWeight;
|
||||||
|
double caseGaussianVar = caseWeightVar/(caseSumWeight*caseSumWeight);
|
||||||
|
double controlGaussianVar = controlWeightVar/(controlSumWeight*controlSumWeight);
|
||||||
|
// todo -- is the z-factor an appropriate statistic?
|
||||||
|
//return 1.0 - 3*(caseGaussianVar+controlGaussianVar)/Math.abs(caseGaussianMean-controlGaussianMean);
|
||||||
|
if ( caseGaussianMean > controlGaussianMean ) {
|
||||||
|
// want to examine the case lower fixedZ*stdev vs the control upper fixedZ*stev
|
||||||
|
return (caseGaussianMean-collection.fixedZ/4*Math.sqrt(caseGaussianVar)) - (controlGaussianMean + collection.fixedZ/4*Math.sqrt(controlGaussianVar));
|
||||||
|
} else {
|
||||||
|
// want to examine the case upper fixedZ*stev vs the control lower fixedZ*stev
|
||||||
|
return (controlGaussianMean-collection.fixedZ/4*Math.sqrt(controlGaussianVar)) - ( caseGaussianMean + collection.fixedZ/4*Math.sqrt(caseGaussianVar));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeHeader() {
|
||||||
|
StringBuffer buf = new StringBuffer();
|
||||||
|
buf.append("description=chr,start,stop");
|
||||||
|
for ( ReadFeatureAggregator f : aggregators ) {
|
||||||
|
buf.append(",");
|
||||||
|
buf.append(f.getClass().getSimpleName());
|
||||||
|
buf.append("-d,");
|
||||||
|
buf.append(f.getClass().getSimpleName());
|
||||||
|
buf.append("-c");
|
||||||
|
}
|
||||||
|
out.printf("track type=bedTable %s%n",buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,105 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.features.table.TableCodec;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/23/11
|
||||||
|
* Time: 5:22 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class RFCombineWalker extends RodWalker<Object,Object> {
|
||||||
|
|
||||||
|
private static final String FIRST_COL = "chrm:start-stop";
|
||||||
|
|
||||||
|
@Output
|
||||||
|
PrintStream out;
|
||||||
|
|
||||||
|
private List<String> order;
|
||||||
|
|
||||||
|
private GenomeLoc prevLoc;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
order = new ArrayList<String>(getToolkit().getRodDataSources().size());
|
||||||
|
StringBuffer header = new StringBuffer();
|
||||||
|
header.append(FIRST_COL);
|
||||||
|
for ( ReferenceOrderedDataSource rSource : getToolkit().getRodDataSources() ) {
|
||||||
|
if ( rSource.getRecordType().isAssignableFrom(TableFeature.class) ) {
|
||||||
|
//System.out.println(rSource.getHeader().toString());
|
||||||
|
for ( String entry : (Collection<String>) rSource.getHeader() ) {
|
||||||
|
if ( ! entry.startsWith("HEADER") ) {
|
||||||
|
header.append("\t");
|
||||||
|
header.append(entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
order.add(rSource.getName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("%s%n",header);
|
||||||
|
|
||||||
|
prevLoc = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object reduceInit() { return null; }
|
||||||
|
|
||||||
|
public Object map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null || ref == null ) { return null;}
|
||||||
|
|
||||||
|
GenomeLoc loc = null;
|
||||||
|
boolean needPrint = false;
|
||||||
|
List<String> eventBySample = new ArrayList<String>();
|
||||||
|
|
||||||
|
for ( String rodName : order ) {
|
||||||
|
List<Object> namedMD = tracker.getReferenceMetaData(rodName,true);
|
||||||
|
TableFeature feature = null;
|
||||||
|
if ( namedMD.size() > 0 ) {
|
||||||
|
feature = namedMD.get(0) instanceof TableFeature ? (TableFeature) namedMD.get(0) : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( feature == null ) { throw new StingException("This should be an instance of TableFeature, no?"); }
|
||||||
|
|
||||||
|
loc = feature.getLocation();
|
||||||
|
if ( prevLoc != null && loc.equals(prevLoc) ) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( String s : feature.getAllValues().subList(1,feature.getAllValues().size()) ) {
|
||||||
|
boolean has = ! (s.charAt(0) == '0');
|
||||||
|
eventBySample.add(s);
|
||||||
|
needPrint |= has;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( needPrint && (loc != null)) {
|
||||||
|
out.printf("%s",loc.toString());
|
||||||
|
for ( String s : eventBySample ) {
|
||||||
|
out.printf("\t%s", s);
|
||||||
|
}
|
||||||
|
out.printf("%n");
|
||||||
|
}
|
||||||
|
|
||||||
|
prevLoc = loc;
|
||||||
|
|
||||||
|
return null;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object reduce(Object map, Object reduce) { return null; }
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,126 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.*;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A (currently very lame) utility to dump read feature information to a file for validity checking of read feature extractors.
|
||||||
|
* Soon to be extended for aggregator and window validation; as well as postspective signal analysis.
|
||||||
|
*/
|
||||||
|
public class RFDumperWalker extends ReadWalker<String[],ReadFeatureWindow> {
|
||||||
|
@ArgumentCollection
|
||||||
|
private RFAArgumentCollection collection = new RFAArgumentCollection();
|
||||||
|
|
||||||
|
List<ReadFeatureAggregator> aggregators;
|
||||||
|
GenomeLoc loc;
|
||||||
|
boolean paired;
|
||||||
|
String sample;
|
||||||
|
String name;
|
||||||
|
|
||||||
|
String[] data;
|
||||||
|
|
||||||
|
@Output
|
||||||
|
PrintStream out;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
|
||||||
|
Set<Class<? extends ReadFeatureAggregator>> aggregatorSet = getFeatureAggregators(collection.inputFeatures);
|
||||||
|
Set<ReadFeatureAggregator> rfHolder1 = new HashSet<ReadFeatureAggregator>(aggregatorSet.size());
|
||||||
|
try {
|
||||||
|
for ( Class<? extends ReadFeatureAggregator> featureClass : aggregatorSet ) {
|
||||||
|
ReadFeatureAggregator readFeature = featureClass.getConstructor(collection.getClass()).newInstance(collection);
|
||||||
|
rfHolder1.add(readFeature);
|
||||||
|
}
|
||||||
|
} catch ( Exception e ) {
|
||||||
|
throw new StingException("A read feature instantiation error occurred during initialization",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()];
|
||||||
|
int idx = 0;
|
||||||
|
for ( ReadFeatureAggregator f : rfHolder1 ) {
|
||||||
|
rfHolder2[idx++] = f;
|
||||||
|
}
|
||||||
|
Arrays.sort(rfHolder2, new Comparator<ReadFeatureAggregator>() {
|
||||||
|
@Override
|
||||||
|
public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) {
|
||||||
|
return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
aggregators = Arrays.asList(rfHolder2);
|
||||||
|
for ( ReadFeatureAggregator ag : aggregators ) {
|
||||||
|
logger.info(ag.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
|
||||||
|
data = new String[aggregators.size()];
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<Class<? extends ReadFeatureAggregator>> getFeatureAggregators(List<String> requestedFeatures) {
|
||||||
|
HashSet<Class<? extends ReadFeatureAggregator>> newFeatureSet = new HashSet<Class<? extends ReadFeatureAggregator>>();
|
||||||
|
List<Class<? extends ReadFeatureAggregator>> availableFeatures = new PluginManager<ReadFeatureAggregator>(ReadFeatureAggregator.class).getPlugins();
|
||||||
|
|
||||||
|
if ( collection.inputFeatures == null ) {
|
||||||
|
newFeatureSet.addAll(availableFeatures);
|
||||||
|
return newFeatureSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Map<String,Class<? extends ReadFeatureAggregator>> classNameToClass = new HashMap<String,Class<? extends ReadFeatureAggregator>>(collection.inputFeatures.size());
|
||||||
|
for ( Class<? extends ReadFeatureAggregator> clazz : availableFeatures ) {
|
||||||
|
classNameToClass.put(clazz.getSimpleName(),clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( String s : requestedFeatures) {
|
||||||
|
if ( classNameToClass.containsKey(s) ) {
|
||||||
|
newFeatureSet.add(classNameToClass.get(s));
|
||||||
|
} else {
|
||||||
|
throw new UserException("The name "+s+" does not correspond to an available read feature class.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newFeatureSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReadFeatureWindow reduceInit() { return null; }
|
||||||
|
|
||||||
|
public String[] map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
// TODO: THIS WILL BREAK IF FEATURE REQUIRES PAIRED READ
|
||||||
|
if ( ref == null ) { return null; } // unmapped reads have null ref contexts
|
||||||
|
loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart());
|
||||||
|
sample = read.getReadGroup().getSample();
|
||||||
|
name = read.getReadName();
|
||||||
|
paired = read.getReadPairedFlag() && ! read.getMateUnmappedFlag();
|
||||||
|
int idx = 0;
|
||||||
|
for ( ReadFeatureAggregator aggregator : aggregators) {
|
||||||
|
data[idx++] = String.format("%s: %s",aggregator.getClass().getSimpleName(),aggregator.parseStr(read));
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReadFeatureWindow reduce(String[] map, ReadFeatureWindow prevReduce) {
|
||||||
|
if ( map == null ) { return null; }
|
||||||
|
StringBuffer fStrBuilder = new StringBuffer();
|
||||||
|
for ( String f : map ) {
|
||||||
|
fStrBuilder.append("\t");
|
||||||
|
fStrBuilder.append(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("%s\t%s\t%s%s%n",loc,name,sample,fStrBuilder);
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,243 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||||
|
import org.broadinstitute.sting.gatk.filters.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.By;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ClippedBases;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.InsertSize;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/12/11
|
||||||
|
* Time: 1:48 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
@ReadFilters({MaxInsertSizeFilter.class,MappingQualityReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class,NotPrimaryAlignmentReadFilter.class,UnmappedReadFilter.class})
|
||||||
|
@By(DataSource.REFERENCE)
|
||||||
|
public class RFExtractorWalker extends ReadWalker<SAMRecord,RFWindow> {
|
||||||
|
|
||||||
|
@ArgumentCollection
|
||||||
|
public RFAArgumentCollection rfaArgs = new RFAArgumentCollection();
|
||||||
|
|
||||||
|
@Argument(doc="Set the marker threshold to this.",shortName="mmt",fullName="markerModeThreshold",required=false)
|
||||||
|
public double markerModeThreshold = 0.05;
|
||||||
|
|
||||||
|
@Argument(doc="Turn on marker mode (1 if sample is significantly greater than the threshold, 0 otherwise)",shortName="mm",fullName="markerMode",required=false)
|
||||||
|
public boolean markerMode = false;
|
||||||
|
|
||||||
|
@Argument(doc="Turn on raw count mode: output will be raw aberrant read counts",shortName="c",fullName="count",required=false)
|
||||||
|
public boolean countMode = false;
|
||||||
|
|
||||||
|
|
||||||
|
@Output
|
||||||
|
public PrintStream out;
|
||||||
|
|
||||||
|
protected Iterator<GenomeLoc> locusIterator;
|
||||||
|
protected GenomeLoc iteratorLoc;
|
||||||
|
protected GenomeLoc loc;
|
||||||
|
protected String sample;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
if ( rfaArgs.windowSize % rfaArgs.windowJump != 0 ) {
|
||||||
|
throw new UserException("Window size is not divisible by window jump.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( markerMode && markerModeThreshold < 0.0 ) {
|
||||||
|
throw new UserException("Cannot have a negative threshold when using marker mode");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( countMode && markerMode ) {
|
||||||
|
throw new UserException("Cannot be both in count mode and marker mode");
|
||||||
|
}
|
||||||
|
|
||||||
|
locusIterator = getToolkit().getIntervals().iterator();
|
||||||
|
iteratorLoc = locusIterator.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
public RFWindow reduceInit() {
|
||||||
|
Map <String,Boolean> allCase = new HashMap<String,Boolean>(getToolkit().getSamples().size());
|
||||||
|
for ( Sample s : getToolkit().getSAMFileSamples() ) {
|
||||||
|
allCase.put(s.getId(),true);
|
||||||
|
if ( s.getId() == null || s.getId().equals("null") ) {
|
||||||
|
throw new StingException("Sample IDs must not be null... " + s.toString() + " " + Boolean.toString(s.hasSAMFileEntry()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<Class<? extends ReadFeatureAggregator>> aggregatorSet = getFeatureAggregators(rfaArgs.inputFeatures);
|
||||||
|
Set<ReadFeatureAggregator> rfHolder1 = new HashSet<ReadFeatureAggregator>(aggregatorSet.size());
|
||||||
|
try {
|
||||||
|
for ( Class<? extends ReadFeatureAggregator> featureClass : aggregatorSet ) {
|
||||||
|
ReadFeatureAggregator readFeature = featureClass.getConstructor(RFAArgumentCollection.class).newInstance(rfaArgs);
|
||||||
|
rfHolder1.add(readFeature);
|
||||||
|
}
|
||||||
|
} catch ( Exception e ) {
|
||||||
|
throw new StingException("A read feature instantiation error occurred during initialization",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
ReadFeatureAggregator[] rfHolder2 = new ReadFeatureAggregator[rfHolder1.size()];
|
||||||
|
int idx = 0;
|
||||||
|
for ( ReadFeatureAggregator f : rfHolder1 ) {
|
||||||
|
rfHolder2[idx++] = f;
|
||||||
|
}
|
||||||
|
Arrays.sort(rfHolder2, new Comparator<ReadFeatureAggregator>() {
|
||||||
|
@Override
|
||||||
|
public int compare(ReadFeatureAggregator a, ReadFeatureAggregator b) {
|
||||||
|
return a.getClass().getSimpleName().compareTo(b.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
List<ReadFeatureAggregator> aggregators = Arrays.asList(rfHolder2);
|
||||||
|
|
||||||
|
out.printf("HEADERchrm:start-stop");
|
||||||
|
for ( String s : allCase.keySet() ) {
|
||||||
|
for ( ReadFeatureAggregator rfa : aggregators ) {
|
||||||
|
out.printf("\t%s.%s",s,rfa.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out.printf("%n");
|
||||||
|
|
||||||
|
return new RFWindow(aggregators,rfaArgs,allCase,getToolkit().getGenomeLocParser());
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
if ( ref == null ) { return null; } // unmapped reads have null ref contexts
|
||||||
|
//loc = getToolkit().getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),read.getAlignmentStart());
|
||||||
|
GenomeLoc newLoc = ref.getLocus().getStartLocation(); // can be problematic if read aligns prior to start of contig -- should never happen
|
||||||
|
if ( newLoc.isPast(iteratorLoc.getStartLocation()) ) {
|
||||||
|
loc = newLoc;
|
||||||
|
} else {
|
||||||
|
loc = iteratorLoc.getStartLocation();
|
||||||
|
}
|
||||||
|
if ( read == null ) { return null; }
|
||||||
|
sample = read.getReadGroup().getSample();
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RFWindow reduce(SAMRecord read, RFWindow prevReduce) {
|
||||||
|
if ( iteratorLoc != null && iteratorLoc.isBefore(loc) ) {// test if read is past end of the user interval
|
||||||
|
//logger.info(String.format("iteratorLoc: %s loc: %s",iteratorLoc.toString(),loc.toString()));
|
||||||
|
onIntervalDone(prevReduce);
|
||||||
|
iteratorLoc = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||||
|
if ( loc.startsBefore(iteratorLoc) ) {
|
||||||
|
loc = iteratorLoc.getStartLocation();
|
||||||
|
}
|
||||||
|
reduce(read,prevReduce);
|
||||||
|
} else if ( read != null ) {
|
||||||
|
// todo -- what happens if first read of an interval is not before or at the start of the interval?
|
||||||
|
|
||||||
|
List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> completed = prevReduce.inc(read, loc, sample,iteratorLoc);
|
||||||
|
if ( completed.size() > 0 ) {
|
||||||
|
// System.out.printf("At %s we have seen %d completed windows%n",loc,completed.size())
|
||||||
|
// bed format
|
||||||
|
for ( Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>> samWindow : completed ) {
|
||||||
|
GenomeLoc window = samWindow.first;
|
||||||
|
/*if ( prevPrint == null ) {
|
||||||
|
prevPrint = window;
|
||||||
|
} else if ( window.startsBefore(prevPrint) ) {
|
||||||
|
throw new StingException(String.format("Attempting to print at %s after having printed a record at %s",window.toString(),prevPrint.toString()));
|
||||||
|
} else {
|
||||||
|
prevPrint = window;
|
||||||
|
}*/
|
||||||
|
out.printf("%s",window.toString());
|
||||||
|
for ( Map.Entry<String,List<ReadFeatureAggregator>> samEntry : samWindow.second.entrySet() ) {
|
||||||
|
for ( ReadFeatureAggregator aggregator : samEntry.getValue() ) {
|
||||||
|
if ( ! markerMode && ! countMode ) {
|
||||||
|
out.printf("\t%.5e,%d",aggregator.getMean(),aggregator.getnReads());
|
||||||
|
} else if ( markerMode ) {
|
||||||
|
out.printf("\t%d",hasEvent(aggregator,markerModeThreshold,rfaArgs.fixedZ) ? 1 : 0);
|
||||||
|
} else if ( countMode ) {
|
||||||
|
out.printf("\t%d", MathUtils.fastRound(aggregator.getMean()*aggregator.getnReads()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out.printf("%n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
prevReduce.inc(null,loc,null,iteratorLoc);
|
||||||
|
}
|
||||||
|
return prevReduce;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RFWindow onIntervalDone(RFWindow rWindow) {
|
||||||
|
//logger.info("In onIntervalDone at genome loc "+iteratorLoc.toString()+" with read loc "+loc.toString());
|
||||||
|
List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> completed = rWindow.flush(iteratorLoc);
|
||||||
|
for ( Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>> samWindow : completed ) {
|
||||||
|
GenomeLoc window = samWindow.first;
|
||||||
|
/*if ( prevPrint == null ) {
|
||||||
|
prevPrint = window;
|
||||||
|
} else if ( window.startsBefore(prevPrint) ) {
|
||||||
|
throw new StingException(String.format("Attempting to print at %s after having printed a record at %s",window.toString(),prevPrint.toString()));
|
||||||
|
} else {
|
||||||
|
prevPrint = window;
|
||||||
|
}*/
|
||||||
|
out.printf("%s",window.toString());
|
||||||
|
for ( Map.Entry<String,List<ReadFeatureAggregator>> samEntry : samWindow.second.entrySet() ) {
|
||||||
|
for ( ReadFeatureAggregator aggregator : samEntry.getValue() ) {
|
||||||
|
if ( ! markerMode && ! countMode ) {
|
||||||
|
out.printf("\t%.5e,%d",aggregator.getMean(),aggregator.getnReads());
|
||||||
|
} else if ( markerMode ) {
|
||||||
|
out.printf("\t%d",hasEvent(aggregator,markerModeThreshold,rfaArgs.fixedZ) ? 1 : 0);
|
||||||
|
} else if ( countMode ) {
|
||||||
|
out.printf("\t%d", MathUtils.fastRound(aggregator.getMean()*aggregator.getnReads()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out.printf("%n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return rWindow;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<Class<? extends ReadFeatureAggregator>> getFeatureAggregators(List<String> requestedFeatures) {
|
||||||
|
HashSet<Class<? extends ReadFeatureAggregator>> newFeatureSet = new HashSet<Class<? extends ReadFeatureAggregator>>();
|
||||||
|
List<Class<? extends ReadFeatureAggregator>> availableFeatures = new PluginManager<ReadFeatureAggregator>(ReadFeatureAggregator.class).getPlugins();
|
||||||
|
|
||||||
|
if ( rfaArgs.inputFeatures == null ) {
|
||||||
|
newFeatureSet.addAll(availableFeatures);
|
||||||
|
return newFeatureSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Map<String,Class<? extends ReadFeatureAggregator>> classNameToClass = new HashMap<String,Class<? extends ReadFeatureAggregator>>(rfaArgs.inputFeatures.size());
|
||||||
|
for ( Class<? extends ReadFeatureAggregator> clazz : availableFeatures ) {
|
||||||
|
classNameToClass.put(clazz.getSimpleName(),clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( String s : requestedFeatures) {
|
||||||
|
if ( classNameToClass.containsKey(s) ) {
|
||||||
|
newFeatureSet.add(classNameToClass.get(s));
|
||||||
|
} else {
|
||||||
|
throw new UserException("The name "+s+" does not correspond to an available read feature class.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newFeatureSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean hasEvent(ReadFeatureAggregator aggregator, double lowThresh, double sigLevel) {
|
||||||
|
return (aggregator.getMean() - lowThresh)*Math.sqrt(aggregator.getnReads())/aggregator.getVar() > sigLevel;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,243 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import com.google.java.contract.Requires;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
*/
|
||||||
|
/**
 * Maintains the tiled, overlapping windows used for read-feature association testing.
 *
 * Windows are kept as two parallel lists: {@code windowStarts} (the starting GenomeLoc of
 * each live window) and {@code aggregatorWindows} (for each live window, a map from sample
 * id — plus the pooled "case"/"control" keys — to a fresh list of feature aggregators).
 * List indices link the two, so they are always grown and shrunk together.
 *
 * Reads are pushed in via {@link #inc}; windows that fall entirely behind the current
 * location are expired and returned to the caller for testing. {@link #flush} expires
 * everything remaining when a user interval ends.
 */
public class RFWindow {

    private RFAArgumentCollection argumentCollection; // the RF Argument Collection from the RF Walker
    private List<GenomeLoc> windowStarts; // holds the starting positions of the windows
    private Map<String,Boolean> sampleCohortMap; // identifies case samples ("true") and control samples ("false")
    List<ReadFeatureAggregator> aggregators; // read feature aggregators to be used (maintained for fast cloning, and ordering)
    // feature ---> sample ---> count
    List<Map<String,List<ReadFeatureAggregator>>> aggregatorWindows; // holds the map between samples and features in the current windows
    private GenomeLocParser parser; // the active parser, for creating GenomeLocs for empty windows
    private GenomeLoc previousLoc; // the previous genome loc within the user interval given to the RFWindow

    /**
     * Defines a new window which maps samples to read feature aggregators
     * @return - a new element for aggregatorWindows
     */
    private Map<String,List<ReadFeatureAggregator>> newWindow() {
        // note: capacity hint is sampleCohortMap.size() but two pooled keys are added below as well
        Map<String,List<ReadFeatureAggregator>> win = new HashMap<String,List<ReadFeatureAggregator>>(sampleCohortMap.size());
        for ( String s : sampleCohortMap.keySet() ) {
            win.put(s,getAggregators());
        }
        // todo -- generalize me
        // pooled cohort-level aggregators, keyed by the literal strings "case" and "control"
        win.put("case",getAggregators());
        win.put("control",getAggregators());

        return win;
    }

    /**
     * Generates a list of new aggregators to collect data
     * @return list of ReadFeatureAggregators to be the value of a new window
     */
    private List<ReadFeatureAggregator> getAggregators() {
        ArrayList<ReadFeatureAggregator> newEmptyAgs = new ArrayList<ReadFeatureAggregator>(aggregators.size());
        try {
            // clone each prototype aggregator by reflectively invoking its
            // (RFAArgumentCollection) constructor
            for ( ReadFeatureAggregator ag : aggregators ) {
                newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(argumentCollection));
            }
        } catch (Exception e) {
            throw new StingException("Error instantiating read feature aggregator",e);
        }

        return newEmptyAgs;
    }

    /**
     * A constructor for the RFWindow object
     * @param aggregators - prototype aggregators; one fresh copy of each is created per sample per window
     * @param collection - the argument collection, from the walker
     * @param cohortMap - the map between sample IDs and whether or not it is a case sample, from the walker
     * @param parser - the Genome Loc Parser, from the walker
     */
    public RFWindow(List<ReadFeatureAggregator> aggregators, RFAArgumentCollection collection, Map<String,Boolean> cohortMap, GenomeLocParser parser) {
        this.argumentCollection = collection;
        this.sampleCohortMap = cohortMap;
        this.parser = parser;
        this.aggregators = aggregators;
    }

    /**
     * Instantiate the tiled windows of sample -> List<aggregator> maps, filling in empty windows up to the one which contains the
     * provided genomeloc if necessary.
     * @param loc - the location to fill up to, usually the starting position of a read or an interval
     * @param currentUserInterval - the current user-provided interval being processed by the walker
     */
    @Requires({"! currentUserInterval.isBefore(loc)"})
    public void instantiate(GenomeLoc loc, GenomeLoc currentUserInterval) {
        aggregatorWindows = new ArrayList<Map<String,List<ReadFeatureAggregator>>>(argumentCollection.windowSize/argumentCollection.windowJump);
        windowStarts = new ArrayList<GenomeLoc>(argumentCollection.windowSize/argumentCollection.windowJump);
        //System.out.printf("Calling fill at %s\t%s%n",loc,currentUserInterval);
        this.fill(loc, currentUserInterval);
    }

    /**
     * Fills the tiled window lists with empty windows up to the one which includes the locus
     * todo -- this can take a lot of memory for large intervals instantiated far into them, e.g.
     * |---------------------------------------------------------------------| interval
     *                                                 .   <=== locus
     * todo -- perhaps a reduced representation for empty windows that were not created but already have expired?
     * @param loc - the location to fill up to, usually the starting position of a read or an interval
     * @param currentUserInterval - the current user-provided interval being processed by the walker
     */
    @Requires({"! currentUserInterval.isBefore(loc)"})
    public void fill(GenomeLoc loc, GenomeLoc currentUserInterval) {
        // case -1: window could be empty
        // (guards flush() being called before any read has instantiated the window lists)
        if ( windowStarts == null ) {
            instantiate(loc,currentUserInterval);
        }

        // case 0: if the windows are empty add in the beginning of the interval
        if ( windowStarts.size() == 0 ) {
            windowStarts.add(currentUserInterval.getStartLocation());
            aggregatorWindows.add(newWindow());
        }

        // case 1: loc is before or within windowJump bases of the current user interval; we need only instantiate the first window
        if ( loc.isBefore(currentUserInterval) || loc.distance(currentUserInterval) <= argumentCollection.windowJump ) {
            // do nothing at all
        } else {
            // case 2: loc is somewhere in the middle or at the end of current user interval, need to fill in windows up until then
            // NOTE(review): nextLoc starts at the last window start (unshifted); the loop
            // body always re-reads the growing list's last element before shifting, so the
            // first iteration's initial value only affects the loop-entry test — confirm
            // boundary behavior against the sibling implementation that starts pre-shifted.
            GenomeLoc nextLoc = windowStarts.get(windowStarts.size()-1);
            while ( loc.distance(nextLoc) > argumentCollection.windowJump ) {
                //System.out.printf("Filling with nextloc %s%n",nextLoc);
                nextLoc = shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump);
                windowStarts.add(nextLoc);
                aggregatorWindows.add(newWindow());
            }
        }

    }

    /**
     * Shifts a location from chrZ:X to chrZ:X+jump
     * @param loc - location to shift
     * @param jump - amount to shift by
     * @return loc with start position shifted by jump
     */
    @Requires("loc.getStart()+jump < parser.getContigInfo(loc.getContig()).getSequenceLength()") // e.g. that the new loc is not off the end of the contig
    private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) {
        return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump);
    }

    /**
     * Fills in missing windows between the previously-seen one and the ending interval, then expires all windows, returning
     * them as complete, resetting previousLocation to null so that the next read re-instantiates internal window data
     * @param userIntervalEnding - the user interval which is ending
     * @return those currently active windows, plus empty interpolating windows between the last active window and the end of the interval
     */
    public List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> flush(GenomeLoc userIntervalEnding) {
        // jump in locations -- flush the windows
        List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> complete = new ArrayList<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>>(aggregators.size());
        // fill in any uncovered windows
        GenomeLoc iStop = userIntervalEnding.getStopLocation();
        //System.out.printf("Calling fill from within flush at %s%n",userIntervalEnding);
        fill(iStop,userIntervalEnding);
        // now expire all windows, terminating them either at the proper window size, or at the endpoint of the interval if it comes sooner
        while ( windowStarts.size() > 0 ) {
            // expire from the front: indices of the two lists stay aligned
            Map<String,List<ReadFeatureAggregator>> cMap = aggregatorWindows.remove(0);
            GenomeLoc wsLoc = windowStarts.remove(0);

            GenomeLoc cLoc;
            if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) {
                // full-size window entirely inside the interval
                cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize);
            } else {
                // window truncated by the end of the interval
                cLoc = wsLoc.endpointSpan(iStop);
            }

            complete.add(new Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>(cLoc,cMap));
        }

        previousLoc = null; // will re-instantiate data upon next loc

        return complete;
    }

    /**
     * Workhorse method:
     *  - determines if new windows need be made
     *  - determines if old windows need be tested & trashed
     *  - determines which windows need to be updated & updates them
     *
     * @param record - the read whose features are aggregated
     * @param loc - starting alignment position of the read
     * @param sample - the sample ID from whom the read was sequenced
     * @param userInterval - the user interval currently being walked; bounds expired windows
     * @return - those feature window(s) that need be tested (and then baleeted)
     */
    public List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> inc(SAMRecord record, GenomeLoc loc, String sample, GenomeLoc userInterval) {
        List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> complete = new ArrayList<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>>(aggregators.size());
        if ( previousLoc == null ) {
            // first time, gotta instantiate stuff
            instantiate(loc,userInterval);
            windowInc(sample,record,aggregatorWindows.get(0));
        } else if ( loc.distance(previousLoc) == 0 ) {
            // best and most common case: just update the living windows
            for ( Map<String,List<ReadFeatureAggregator>> win : aggregatorWindows ) {
                windowInc(sample,record,win);
            }
        } else {
            // another standard case: we've gone to some further base in the same interval
            while ( loc.distance(windowStarts.get(windowStarts.size()-1)) > argumentCollection.windowJump ) {
                // careful, don't use the location itself, but add in windows every winJump bases until this condition is not met
                //System.out.printf("Adding within inc at %s\t%s%n",loc,userInterval);
                windowStarts.add(shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump));
                aggregatorWindows.add(newWindow());
            }

            // expire windows that now lie entirely behind loc
            while ( windowStarts.size() > 0 && loc.distance(windowStarts.get(0)) > argumentCollection.windowSize ) {
                Map<String,List<ReadFeatureAggregator>> cMap = aggregatorWindows.remove(0);
                GenomeLoc wsLoc = windowStarts.remove(0);
                GenomeLoc iStop = userInterval.getStopLocation();
                GenomeLoc cLoc;
                if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) {
                    cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize);
                } else {
                    cLoc = wsLoc.endpointSpan(iStop);
                }
                complete.add(new Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>(cLoc,cMap));
            }

            // finally update whatever windows remain alive with this read
            for ( Map<String,List<ReadFeatureAggregator>> win : aggregatorWindows ) {
                windowInc(sample,record,win);
            }

        }

        previousLoc = loc;

        return complete;
    }

    /**
     * Incorporate new features into the window
     * @param sample - id of sample from which the features come
     * @param record - the read
     * @param window - the particular window to be updated
     */
    private void windowInc(String sample, SAMRecord record, Map<String,List<ReadFeatureAggregator>> window) {
        if ( sample == null || record == null ) { return; }

        // per-sample aggregators
        for (ReadFeatureAggregator aggregator : window.get(sample) ) {
            aggregator.aggregate(record);
        }

        // pooled cohort aggregators ("case"/"control" keys set up in newWindow())
        for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? "case" : "control") ) {
            aggregator.aggregate(record);
        }
    }

}
|
||||||
|
|
@ -0,0 +1,227 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import com.google.java.contract.Requires;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
*/
|
||||||
|
public class RFWindow {
|
||||||
|
|
||||||
|
private RFAArgumentCollection argumentCollection; // the RF Argument Collection from the RF Walker
|
||||||
|
private List<GenomeLoc> windowStarts; // holds the starting positions of the windows
|
||||||
|
private Map<String,Boolean> sampleCohortMap; // identifies case samples ("true") and control samples ("false")
|
||||||
|
List<ReadFeatureAggregator> aggregators; // read feature aggregators to be used (maintained for fast cloning, and ordering)
|
||||||
|
// feature ---> sample ---> count
|
||||||
|
List<Map<String,List<ReadFeatureAggregator>>> aggregatorWindows; // holds the map between samples and features in the current windows
|
||||||
|
private GenomeLocParser parser; // the active parser, for creating GenomeLocs for empty windows
|
||||||
|
private GenomeLoc previousLoc; // the previous genome loc within the user interval given to the RFWindow
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Defines a new window which maps samples to read feature aggregators
|
||||||
|
* @return - a new element for aggregatorWindows
|
||||||
|
*/
|
||||||
|
private Map<String,List<ReadFeatureAggregator>> newWindow() {
|
||||||
|
Map<String,List<ReadFeatureAggregator>> win = new HashMap<String,List<ReadFeatureAggregator>>(sampleCohortMap.size());
|
||||||
|
for ( String s : sampleCohortMap.keySet() ) {
|
||||||
|
win.put(s,getAggregators());
|
||||||
|
}
|
||||||
|
// todo -- generalize me
|
||||||
|
win.put("case",getAggregators());
|
||||||
|
win.put("control",getAggregators());
|
||||||
|
|
||||||
|
return win;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates a list of new aggregators to collect data
|
||||||
|
* @return list of ReadFeatureAggregators to be the value of a new window
|
||||||
|
*/
|
||||||
|
private List<ReadFeatureAggregator> getAggregators() {
|
||||||
|
ArrayList<ReadFeatureAggregator> newEmptyAgs = new ArrayList<ReadFeatureAggregator>(aggregators.size());
|
||||||
|
try {
|
||||||
|
for ( ReadFeatureAggregator ag : aggregators ) {
|
||||||
|
newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(argumentCollection));
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new StingException("Error instantiating read feature aggregator",e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return newEmptyAgs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A constructor for the RFWindow object
|
||||||
|
* @param collection - the argument collection, from the walker
|
||||||
|
* @param cohortMap - the map between sample IDs and whether or not it is a case sample, from the walker
|
||||||
|
* @param parser - the Genome Loc Parser, from the walker
|
||||||
|
*/
|
||||||
|
public RFWindow(List<ReadFeatureAggregator> aggregators, RFAArgumentCollection collection, Map<String,Boolean> cohortMap, GenomeLocParser parser) {
|
||||||
|
this.argumentCollection = collection;
|
||||||
|
this.sampleCohortMap = cohortMap;
|
||||||
|
this.parser = parser;
|
||||||
|
this.aggregators = aggregators;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instantiate the tiled windows of sample -> List<aggregator> maps, filling in empty windows up to the one which contains the
|
||||||
|
* provided genomeloc if necessary.
|
||||||
|
* @param loc - the location to fill up to, usually the starting position of a read or an interval
|
||||||
|
* @param currentUserInterval - the current user-provided interval being processed by the walker
|
||||||
|
*/
|
||||||
|
@Requires({"! currentUserInterval.isBefore(loc)"})
|
||||||
|
public void instantiate(GenomeLoc loc, GenomeLoc currentUserInterval) {
|
||||||
|
aggregatorWindows = new ArrayList<Map<String,List<ReadFeatureAggregator>>>(argumentCollection.windowSize/argumentCollection.windowJump);
|
||||||
|
windowStarts = new ArrayList<GenomeLoc>(argumentCollection.windowSize/argumentCollection.windowJump);
|
||||||
|
this.fill(loc, currentUserInterval);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills the tiled window lists with empty windows up to the one which includes the locus
|
||||||
|
* todo -- this can take a lot of memory for large intervals instantiated far into them, e.g.
|
||||||
|
* |---------------------------------------------------------------------| interval
|
||||||
|
* . <=== locus
|
||||||
|
* todo -- perhaps a reduced representation for empty windows that were not created but already have expired?
|
||||||
|
* @param loc - the location to fill up to, usually the starting position of a read or an interval
|
||||||
|
* @param currentUserInterval - the current user-provided interval being processed by the walker
|
||||||
|
*/
|
||||||
|
@Requires({"! currentUserInterval.isBefore(loc)"})
|
||||||
|
public void fill(GenomeLoc loc, GenomeLoc currentUserInterval) {
|
||||||
|
// case 0: if the windows are empty add in the beginning of the interval
|
||||||
|
if ( windowStarts.size() == 0 ) {
|
||||||
|
windowStarts.add(currentUserInterval.getStartLocation());
|
||||||
|
aggregatorWindows.add(newWindow());
|
||||||
|
}
|
||||||
|
|
||||||
|
// case 1: loc is before or within windowJump bases of the current user interval; we need only instantiate the first window
|
||||||
|
if ( loc.isBefore(currentUserInterval) || loc.distance(currentUserInterval) <= argumentCollection.windowJump ) {
|
||||||
|
// do nothing at all
|
||||||
|
} else {
|
||||||
|
// case 2: loc is somewhere in the middle or at the end of current user interval, need to fill in windows up until then
|
||||||
|
GenomeLoc nextLoc = shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump);
|
||||||
|
while ( loc.distance(nextLoc) > argumentCollection.windowJump ) {
|
||||||
|
nextLoc = shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump);
|
||||||
|
windowStarts.add(nextLoc);
|
||||||
|
aggregatorWindows.add(newWindow());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shifts a location from chrZ:X to chrZ:X+jump
|
||||||
|
* @param loc - location to shift
|
||||||
|
* @param jump - amount to shift by
|
||||||
|
* @return loc with start position shifted by jump
|
||||||
|
*/
|
||||||
|
@Requires("loc.getStart()+jump < parser.getContigInfo(loc.getContig()).getSequenceLength()") // e.g. that the new loc is not off the end of the contig
|
||||||
|
private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) {
|
||||||
|
return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills in missing windows between the previously-seen one and the ending interval, then expires all windows, returning
|
||||||
|
* them as complete, resetting previousLocation to null so that the next read re-instantiates internal window data
|
||||||
|
* @param userIntervalEnding - the user interval which is ending
|
||||||
|
* @return those currently active windows, plus empty interpolating windows between the last active window and the end of the interval
|
||||||
|
*/
|
||||||
|
public List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> flush(GenomeLoc userIntervalEnding) {
|
||||||
|
// jump in locations -- flush the windows
|
||||||
|
List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> complete = new ArrayList<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>>(aggregators.size());
|
||||||
|
// fill in any uncovered windows
|
||||||
|
GenomeLoc iStop = userIntervalEnding.getStopLocation();
|
||||||
|
fill(iStop,userIntervalEnding);
|
||||||
|
// now expire all windows, terminating them either at the proper window size, or at the endpoint of the interval if it comes sooner
|
||||||
|
while ( windowStarts.size() > 0 ) {
|
||||||
|
Map<String,List<ReadFeatureAggregator>> cMap = aggregatorWindows.remove(0);
|
||||||
|
GenomeLoc wsLoc = windowStarts.remove(0);
|
||||||
|
|
||||||
|
GenomeLoc cLoc;
|
||||||
|
if ( wsLoc.distance(iStop) > argumentCollection.windowSize ) {
|
||||||
|
cLoc = parser.createGenomeLoc(wsLoc.getContig(),wsLoc.getStart(),wsLoc.getStart()+argumentCollection.windowSize);
|
||||||
|
} else {
|
||||||
|
cLoc = wsLoc.endpointSpan(iStop);
|
||||||
|
}
|
||||||
|
|
||||||
|
complete.add(new Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>(cLoc,cMap));
|
||||||
|
}
|
||||||
|
|
||||||
|
previousLoc = null; // will re-instantiate data upon next loc
|
||||||
|
|
||||||
|
return complete;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Workhorse method:
|
||||||
|
* - determines if new windows need be made
|
||||||
|
* - determines if old windows need be tested & trashed
|
||||||
|
* - determines which windows need to be updated & updates them
|
||||||
|
*
|
||||||
|
* @param loc - starting alignment position of the read
|
||||||
|
* @param sample - the sample ID from whom the read was sequenced
|
||||||
|
* @return - those feature window(s) that need be tested (and then baleeted)
|
||||||
|
*/
|
||||||
|
public List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> inc(SAMRecord record, GenomeLoc loc, String sample, GenomeLoc userInterval) {
|
||||||
|
List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> complete = new ArrayList<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>>(aggregators.size());
|
||||||
|
if ( previousLoc == null ) {
|
||||||
|
// first time, gotta instantiate stuff
|
||||||
|
instantiate(loc,userInterval);
|
||||||
|
windowInc(sample,record,aggregatorWindows.get(0));
|
||||||
|
} else if ( loc.distance(previousLoc) == 0 ) {
|
||||||
|
// best and most common case: just update the living windows
|
||||||
|
for ( Map<String,List<ReadFeatureAggregator>> win : aggregatorWindows ) {
|
||||||
|
windowInc(sample,record,win);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// another standard case: we've gone to some further base in the same interval
|
||||||
|
while ( loc.distance(windowStarts.get(windowStarts.size()-1)) > argumentCollection.windowJump ) {
|
||||||
|
// careful, don't use the location itself, but add in windows every winJump bases until this condition is not met
|
||||||
|
windowStarts.add(shiftLocByJump(windowStarts.get(windowStarts.size()-1),argumentCollection.windowJump));
|
||||||
|
aggregatorWindows.add(newWindow());
|
||||||
|
}
|
||||||
|
|
||||||
|
while ( windowStarts.size() > 0 && loc.distance(windowStarts.get(0)) > argumentCollection.windowSize ) {
|
||||||
|
Map<String,List<ReadFeatureAggregator>> cMap = aggregatorWindows.remove(0);
|
||||||
|
GenomeLoc cLoc = windowStarts.remove(0).endpointSpan(previousLoc);
|
||||||
|
complete.add(new Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>(cLoc,cMap));
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( Map<String,List<ReadFeatureAggregator>> win : aggregatorWindows ) {
|
||||||
|
windowInc(sample,record,win);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
previousLoc = loc;
|
||||||
|
|
||||||
|
return complete;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Incorporate new features into the window
|
||||||
|
* @param sample - id of sample from which the features come
|
||||||
|
* @param record - the read
|
||||||
|
* @param window - the particular window to be updated
|
||||||
|
*/
|
||||||
|
private void windowInc(String sample, SAMRecord record, Map<String,List<ReadFeatureAggregator>> window) {
|
||||||
|
if ( sample == null || record == null ) { return; }
|
||||||
|
|
||||||
|
for (ReadFeatureAggregator aggregator : window.get(sample) ) {
|
||||||
|
aggregator.aggregate(record);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? "case" : "control") ) {
|
||||||
|
aggregator.aggregate(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,152 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features.ReadFeatureAggregator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Workhorse class of read feature association: maintains the active windows, culls the inactive ones and passes
|
||||||
|
* them back for testing. Windows consist of aggregators, and list indeces link windows to their starting loc which
|
||||||
|
* determines their active/inactive status.
|
||||||
|
*/
|
||||||
|
public class ReadFeatureWindow {
    // argument collection forwarded to each reflectively-constructed aggregator
    RFAArgumentCollection args;
    // starting positions of the live windows; index-aligned with aggregatorWindows
    private List<GenomeLoc> winStarts;
    // location of the previous read; null until the first inc() and after each flush()
    private GenomeLoc previousLoc;
    // window span in bases (copied from args.windowSize)
    private int winSize;
    // spacing between consecutive window starts (copied from args.windowJump)
    private int winJump;
    // parser used to build shifted GenomeLocs for new window starts
    private GenomeLocParser parser;

    // sample id -> true for case samples, false for controls
    private Map<String,Boolean> sampleCohortMap;
    // prototype aggregators; a fresh copy of each is made per sample per window
    List<ReadFeatureAggregator> aggregators;
    // feature ---> sample ---> count
    List<Map<String,List<ReadFeatureAggregator>>> aggregatorWindows;

    /**
     * @param aggregators - prototype aggregators to clone into each new window
     * @param cohortMap - sample id -> case (true) / control (false)
     * @param args - RFA arguments; supplies windowSize and windowJump
     * @param parser - GenomeLoc parser for creating shifted window-start locs
     */
    public ReadFeatureWindow(List<ReadFeatureAggregator> aggregators, Map<String,Boolean> cohortMap, RFAArgumentCollection args, GenomeLocParser parser) {
        winSize = args.windowSize;
        winJump = args.windowJump;
        sampleCohortMap = cohortMap;
        this.args = args;
        this.aggregators =aggregators;
        this.parser = parser;
    }

    /**
     * (Re)creates the window lists with a single window starting at loc.
     * Called lazily by inc() whenever previousLoc is null.
     */
    public void instantiate(GenomeLoc loc) {
        aggregatorWindows = new ArrayList<Map<String,List<ReadFeatureAggregator>>>(winSize/winJump);
        winStarts = new ArrayList<GenomeLoc>(winSize/winJump);
        winStarts.add(loc);
        aggregatorWindows.add(newWindow());
    }

    /**
     * Workhorse method:
     *  - determines if new windows need be made
     *  - determines if old windows need be tested & trashed
     *  - determines which windows need to be updated & updates them
     *
     * @param record - the read whose features are aggregated
     * @param loc - starting alignment position of the read
     * @param sample - the sample ID from whom the read was sequenced
     * @return - those feature window(s) that need be tested (and then baleeted)
     */
    public List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> inc(SAMRecord record, GenomeLoc loc, String sample) {
        List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> complete = new ArrayList<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>>(aggregators.size());
        if ( previousLoc == null ) {
            // first time, gotta instantiate stuff
            instantiate(loc);
            windowInc(sample,record,aggregatorWindows.get(0));
        } else if ( loc.distance(previousLoc) == 0 ) {
            // best and most common case: just update the living windows
            for ( Map<String,List<ReadFeatureAggregator>> win : aggregatorWindows ) {
                windowInc(sample,record,win);
            }
        } else {
            // another standard case: we've gone to some further base in the same interval
            while ( loc.distance(winStarts.get(winStarts.size()-1)) > winJump ) {
                // careful, don't use the location itself, but add in windows every winJump bases until this condition is not met
                winStarts.add(shiftLocByJump(winStarts.get(winStarts.size()-1),winJump));
                aggregatorWindows.add(newWindow());
            }

            // expire windows that now lie entirely behind loc; expired windows end at the
            // previously seen location
            while ( winStarts.size() > 0 && loc.distance(winStarts.get(0)) > winSize ) {
                Map<String,List<ReadFeatureAggregator>> cMap = aggregatorWindows.remove(0);
                GenomeLoc cLoc = winStarts.remove(0).endpointSpan(previousLoc);
                complete.add(new Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>(cLoc,cMap));
            }

            // finally update whatever windows remain alive with this read
            for ( Map<String,List<ReadFeatureAggregator>> win : aggregatorWindows ) {
                windowInc(sample,record,win);
            }

        }

        previousLoc = loc;

        return complete;
    }

    /**
     * Expires all remaining windows, returning them as complete, and resets previousLoc so
     * the next read re-instantiates the window data.
     *
     * NOTE(review): winStarts is null until the first inc(); calling flush() before any
     * read has been seen would NPE here — confirm the walker guarantees at least one inc()
     * before each flush().
     */
    public List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> flush() {
        // jump in locations -- flush the windows
        List<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>> complete = new ArrayList<Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>>(aggregators.size());
        while ( winStarts.size() > 0 ) {
            // expire from the front: indices of the two lists stay aligned
            Map<String,List<ReadFeatureAggregator>> cMap = aggregatorWindows.remove(0);
            GenomeLoc cLoc = winStarts.remove(0).endpointSpan(previousLoc);
            complete.add(new Pair<GenomeLoc,Map<String,List<ReadFeatureAggregator>>>(cLoc,cMap));
        }

        previousLoc = null; // will re-instantiate data upon next loc

        return complete;
    }

    /**
     * Defines a new window mapping each sample (plus the pooled "case"/"control" keys)
     * to a fresh list of aggregators.
     * @return - a new element for aggregatorWindows
     */
    private Map<String,List<ReadFeatureAggregator>> newWindow() {
        Map<String,List<ReadFeatureAggregator>> win = new HashMap<String,List<ReadFeatureAggregator>>(sampleCohortMap.size());
        for ( String s : sampleCohortMap.keySet() ) {
            win.put(s,getAggregators());
        }
        // todo -- generalize me
        win.put("case",getAggregators());
        win.put("control",getAggregators());

        return win;
    }

    /**
     * Clones each prototype aggregator by reflectively invoking its
     * (RFAArgumentCollection) constructor.
     * @return fresh, empty aggregators for a new window
     */
    private List<ReadFeatureAggregator> getAggregators() {
        ArrayList<ReadFeatureAggregator> newEmptyAgs = new ArrayList<ReadFeatureAggregator>(aggregators.size());
        try {
            for ( ReadFeatureAggregator ag : aggregators ) {
                newEmptyAgs.add(ag.getClass().getConstructor(RFAArgumentCollection.class).newInstance(args));
            }
        } catch (Exception e) {
            throw new StingException("Error instantiating read feature aggregator",e);
        }

        return newEmptyAgs;
    }

    /**
     * Incorporate new features into the window
     * @param sample - id of sample from which the features come
     * @param record - the read
     * @param window - the particular window to be updated
     */
    private void windowInc(String sample, SAMRecord record, Map<String,List<ReadFeatureAggregator>> window) {
        if ( sample == null || record == null ) { return; }

        // per-sample aggregators
        for (ReadFeatureAggregator aggregator : window.get(sample) ) {
            aggregator.aggregate(record);
        }

        // pooled cohort aggregators ("case"/"control" keys set up in newWindow())
        for ( ReadFeatureAggregator aggregator : window.get( sampleCohortMap.get(sample) ? "case" : "control") ) {
            aggregator.aggregate(record);
        }
    }

    /**
     * Shifts a location from chrZ:X to chrZ:X+jump.
     * @param loc - location to shift
     * @param jump - amount to shift by
     * @return loc with start position shifted by jump
     */
    private GenomeLoc shiftLocByJump(GenomeLoc loc, int jump) {
        return parser.createGenomeLoc(loc.getContig(),loc.getStart()+jump);
    }
}
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 1:09 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class AberrantInsertSize extends BinaryFeatureAggregator {
|
||||||
|
|
||||||
|
private int min;
|
||||||
|
private int max;
|
||||||
|
|
||||||
|
public AberrantInsertSize(RFAArgumentCollection col) {
|
||||||
|
super(col);
|
||||||
|
min = col.lowInsertSize;
|
||||||
|
max = col.highInsertSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean extractFeature(SAMRecord rec) {
|
||||||
|
return Math.abs(rec.getInferredInsertSize()) > max || Math.abs(rec.getInferredInsertSize()) < min;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean featureDefined(SAMRecord rec) {
|
||||||
|
return rec.getReadPairedFlag() && rec.getProperPairFlag();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.CigarOperator;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: Ghost
|
||||||
|
* Date: 5/19/11
|
||||||
|
* Time: 12:08 AM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class BinaryClippedBases extends BinaryFeatureAggregator {
|
||||||
|
|
||||||
|
private short baseLim;
|
||||||
|
private final byte baseQualLim = 20;
|
||||||
|
|
||||||
|
public Boolean extractFeature(SAMRecord read) {
|
||||||
|
int firstClippedToAliStart = read.getUnclippedStart()-read.getAlignmentStart();
|
||||||
|
int lastUnclippedToReadEnd = read.getUnclippedEnd()-read.getAlignmentEnd();
|
||||||
|
|
||||||
|
byte[] quals = read.getBaseQualities();
|
||||||
|
int nClipped = 0;
|
||||||
|
for ( int offset = 0; offset < firstClippedToAliStart; offset++ ) {
|
||||||
|
if ( quals[offset] >= baseQualLim ) {
|
||||||
|
nClipped++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( int offset = quals.length - lastUnclippedToReadEnd; offset < quals.length ; offset++ ) {
|
||||||
|
if ( quals[offset] >= baseQualLim ) {
|
||||||
|
nClipped ++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nClipped >= baseLim;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean featureDefined(SAMRecord rec) { return ! rec.getReadPairedFlag() || Math.abs(rec.getInferredInsertSize()) > 100; } // unpaired or no adaptor sequence
|
||||||
|
|
||||||
|
public BinaryClippedBases(RFAArgumentCollection col) {
|
||||||
|
super(col);
|
||||||
|
baseLim = col.clippedBases;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,24 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 12:52 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public abstract class BinaryFeatureAggregator extends ReadFeatureAggregator<Boolean> {
|
||||||
|
|
||||||
|
public BinaryFeatureAggregator(RFAArgumentCollection col) {
|
||||||
|
super(col);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void aggregate(Boolean hasFeature) {
|
||||||
|
// now robustified
|
||||||
|
mean = ( (hasFeature ? 1 : 0)+nReads*mean)/(++nReads);
|
||||||
|
var = mean*(1-mean) + Math.pow(2,1-nReads);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,24 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 12:52 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public abstract class BinaryFeatureAggregator extends ReadFeatureAggregator<Boolean> {
|
||||||
|
|
||||||
|
public BinaryFeatureAggregator(RFAArgumentCollection col) {
|
||||||
|
super(col);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void aggregate(Boolean hasFeature) {
|
||||||
|
// now robustified
|
||||||
|
mean = ( 1+(hasFeature ? 1 : 0)+nReads*mean)/(++nReads+1);
|
||||||
|
var = mean*(1-mean);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.CigarOperator;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 1:33 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class ClippedBases {
|
||||||
|
// todo -- make a binary feature version of this
|
||||||
|
|
||||||
|
public Integer extractFeature(SAMRecord record) {
|
||||||
|
int nClipped = 0;
|
||||||
|
|
||||||
|
for ( CigarElement e : record.getCigar().getCigarElements() ) {
|
||||||
|
if ( e.getOperator().equals(CigarOperator.SOFT_CLIP) || e.getOperator().equals(CigarOperator.HARD_CLIP) ) {
|
||||||
|
nClipped += e.getLength();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nClipped;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean featureDefined(SAMRecord rec) { return true; }
|
||||||
|
|
||||||
|
public ClippedBases(RFAArgumentCollection col) {
|
||||||
|
//super(col);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 12:58 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class InsertSize {
|
||||||
|
// todo -- this is deprecated by AIS, so the extension is removed.
|
||||||
|
|
||||||
|
public InsertSize(RFAArgumentCollection col) {
|
||||||
|
//super(col);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Integer extractFeature(SAMRecord record) {
|
||||||
|
return Math.abs(record.getInferredInsertSize());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean featureDefined(SAMRecord record) {
|
||||||
|
return record.getReadPairedFlag() && record.getProperPairFlag();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 1:06 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class MateOtherContig extends BinaryFeatureAggregator {
|
||||||
|
|
||||||
|
public MateOtherContig(RFAArgumentCollection col) {
|
||||||
|
super(col);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean extractFeature(SAMRecord record) {
|
||||||
|
return ! record.getReferenceName().equals(record.getMateReferenceName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean featureDefined(SAMRecord read) {
|
||||||
|
return read.getReadPairedFlag() && ! read.getMateUnmappedFlag();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 1:15 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class MateSameStrand extends BinaryFeatureAggregator {
|
||||||
|
|
||||||
|
public Boolean extractFeature(SAMRecord record) {
|
||||||
|
return record.getReadNegativeStrandFlag() == record.getMateNegativeStrandFlag();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean featureDefined(SAMRecord record) {
|
||||||
|
return record.getReadPairedFlag() && ! record.getMateUnmappedFlag() && record.getReferenceIndex().equals(record.getMateReferenceIndex());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public MateSameStrand(RFAArgumentCollection col) {
|
||||||
|
super(col);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 1:32 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class MateUnmapped extends BinaryFeatureAggregator {
|
||||||
|
|
||||||
|
public Boolean extractFeature(SAMRecord record) {
|
||||||
|
return record.getMateUnmappedFlag();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean featureDefined(SAMRecord record) {
|
||||||
|
return record.getReadPairedFlag();
|
||||||
|
}
|
||||||
|
|
||||||
|
public MateUnmapped(RFAArgumentCollection col) {
|
||||||
|
super(col);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,50 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 12:39 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
/**
 * Base class for aggregators over an integer per-read feature. Maintains a
 * trimmed running mean/variance: the current minimum and maximum observations
 * are withheld from the statistics and only folded in once displaced by a new
 * extreme, making the estimate robust to single outliers at either end.
 */
public abstract class NumericFeatureAggregator extends ReadFeatureAggregator<Integer> {

    // current withheld extremes; -1 is the "unset" sentinel
    // NOTE(review): the sentinel collides with a legitimate datum of -1 -- this
    // is safe only if features are non-negative (e.g. absolute insert sizes);
    // confirm against the concrete subclasses.
    private int min;
    private int max;

    public NumericFeatureAggregator(RFAArgumentCollection col) {
        super(col);
        min = -1;
        max = -1;
    }

    /**
     * Incorporate a new observation. The first two observations only seed
     * min/max; afterwards a new extreme releases the displaced old extreme
     * into the running statistics, while interior values update them directly.
     * @param datum the feature value extracted from a read
     */
    protected void aggregate(Integer datum) {
        if ( min == -1 ) {
            // first observation: becomes the provisional minimum
            min = datum;
        } else if ( max == -1 ) {
            // second observation: order the two seeds into (min, max)
            if ( datum > min ) {
                max = datum;
            } else {
                max = min;
                min = datum;
            }
        } else if ( datum > max ) {
            // new maximum: the old max is no longer extreme, count it
            update(max);
            max = datum;
        } else if ( datum < min ) {
            // new minimum: the old min is no longer extreme, count it
            update(min);
            min = datum;
        } else {
            // interior value: goes straight into the statistics
            update(datum);
        }
    }

    /**
     * Online (Welford-style) update of the running mean and variance.
     * Order matters: mean must be updated before var, and nReads is
     * incremented as part of the var expression.
     * @param datum the value being folded into the statistics
     */
    protected void update(Integer datum) {
        double oldMean = mean;
        mean += (datum - mean)/(1+nReads);
        var = ((nReads*var) + (datum - oldMean)*(datum-mean))/++nReads;
    }
}
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 6/8/11
|
||||||
|
* Time: 11:59 AM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class ProperPair extends BinaryFeatureAggregator {
|
||||||
|
|
||||||
|
public ProperPair(RFAArgumentCollection collection) {
|
||||||
|
super(collection);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean extractFeature(SAMRecord record) {
|
||||||
|
return record.getProperPairFlag();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean featureDefined(SAMRecord record) {
|
||||||
|
return record.getReadPairedFlag();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.newassociation.features;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 5/4/11
|
||||||
|
* Time: 12:33 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public abstract class ReadFeatureAggregator<X> {
|
||||||
|
|
||||||
|
protected double mean;
|
||||||
|
protected double var;
|
||||||
|
protected int nReads;
|
||||||
|
|
||||||
|
public ReadFeatureAggregator(RFAArgumentCollection collection) {
|
||||||
|
init(collection);
|
||||||
|
mean = 0.0;
|
||||||
|
var = 0.0;
|
||||||
|
nReads = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void aggregate(SAMRecord record) {
|
||||||
|
if ( featureDefined(record) ) {
|
||||||
|
aggregate(extractFeature(record));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract void aggregate(X feature);
|
||||||
|
|
||||||
|
protected abstract boolean featureDefined(SAMRecord record);
|
||||||
|
|
||||||
|
protected abstract X extractFeature(SAMRecord record);
|
||||||
|
|
||||||
|
public double getMean() { return mean; }
|
||||||
|
public double getVar() { return var; }
|
||||||
|
public double getUnbiasedVar() { return var*( (double) nReads)/(nReads-1); }
|
||||||
|
public int getnReads() { return nReads; }
|
||||||
|
|
||||||
|
public void init(RFAArgumentCollection collection) { }
|
||||||
|
|
||||||
|
public X parse(SAMRecord read) {
|
||||||
|
if ( featureDefined(read) ) {
|
||||||
|
return extractFeature(read);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String parseStr(SAMRecord read) {
|
||||||
|
if ( featureDefined(read) ) {
|
||||||
|
return extractFeature(read).toString();
|
||||||
|
} else {
|
||||||
|
return "undefined";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,169 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers.validation;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 6/13/11
|
||||||
|
* Time: 2:12 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class),
|
||||||
|
@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)})
|
||||||
|
public class PickSequenomProbes2 extends RodWalker<Integer,Integer> {
|
||||||
|
|
||||||
|
@Output
|
||||||
|
PrintStream out;
|
||||||
|
|
||||||
|
GenomeLoc prevInterval;
|
||||||
|
GenomeLoc allelePos;
|
||||||
|
String probeName;
|
||||||
|
StringBuilder sequence;
|
||||||
|
boolean sequenceInvalid;
|
||||||
|
|
||||||
|
public Integer reduceInit() {
|
||||||
|
prevInterval = null;
|
||||||
|
sequence = null;
|
||||||
|
sequenceInvalid = false;
|
||||||
|
probeName = null;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; }
|
||||||
|
|
||||||
|
GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation();
|
||||||
|
if ( prevInterval == null || ! interval.equals(prevInterval) ) {
|
||||||
|
// we're in a new interval, we should:
|
||||||
|
// 1) print out previous data
|
||||||
|
// 2) reset internal data
|
||||||
|
// 3) instantiate traversal of this interval
|
||||||
|
|
||||||
|
// step 1:
|
||||||
|
if ( prevInterval != null ) {
|
||||||
|
// there was a previous interval
|
||||||
|
validateSequence(); // ensure the sequence in the region is valid
|
||||||
|
lowerRepeats(); // change repeats in sequence to lower case
|
||||||
|
print(); // print out the fasta sequence
|
||||||
|
}
|
||||||
|
|
||||||
|
// step 2:
|
||||||
|
prevInterval = interval;
|
||||||
|
allelePos = null;
|
||||||
|
sequence = new StringBuilder();
|
||||||
|
sequenceInvalid = false;
|
||||||
|
probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues().get(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// step 3 (or 1 if not new):
|
||||||
|
// build up the sequence
|
||||||
|
|
||||||
|
VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus());
|
||||||
|
VariantContext validate = tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus());
|
||||||
|
|
||||||
|
if ( mask == null && validate == null ) {
|
||||||
|
sequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||||
|
} else if ( validate != null ) {
|
||||||
|
// doesn't matter if there's a mask here too -- this is what we want to validate
|
||||||
|
sequence.append('[');
|
||||||
|
sequence.append(validate.getAlternateAllele(0).toString());
|
||||||
|
sequence.append('/');
|
||||||
|
sequence.append(validate.getReference().toString());
|
||||||
|
sequence.append(']');
|
||||||
|
allelePos = ref.getLocus();
|
||||||
|
} else /* (mask != null && validate == null ) */ {
|
||||||
|
if ( ! mask.isSNP() ) {
|
||||||
|
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
|
||||||
|
logger.warn(mask.toString());
|
||||||
|
sequenceInvalid = true;
|
||||||
|
sequence.append((char) ref.getBase());
|
||||||
|
} else {
|
||||||
|
sequence.append((char) BaseUtils.N);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduce(Integer i, Integer j) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void validateSequence() {
|
||||||
|
// code for ensuring primer sequence is valid goes here
|
||||||
|
|
||||||
|
// validate that there are no masked sites near to the variant site
|
||||||
|
String seq = sequence.toString();
|
||||||
|
int start = seq.indexOf('[') - 4;
|
||||||
|
int end = seq.indexOf(']') + 5;
|
||||||
|
boolean maskNearVariantSite = false;
|
||||||
|
for ( int i = start; i < end; i++ ) {
|
||||||
|
maskNearVariantSite |= (seq.charAt(i) == 'N');
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( maskNearVariantSite ) {
|
||||||
|
logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
|
||||||
|
logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void lowerRepeats() {
|
||||||
|
// convert to lower case low-complexity repeats, e.g. tandem k-mers
|
||||||
|
final int K_LIM = 6;
|
||||||
|
String seq = sequence.toString();
|
||||||
|
StringBuilder newSequence = new StringBuilder();
|
||||||
|
int start_pos = 0;
|
||||||
|
while( start_pos < seq.length() ) {
|
||||||
|
boolean broke = false;
|
||||||
|
for ( int length = K_LIM; length > 1; length -- ) {
|
||||||
|
if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) {
|
||||||
|
newSequence.append(seq.substring(start_pos,start_pos+2*length).toLowerCase());
|
||||||
|
start_pos += length;
|
||||||
|
broke = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! broke ) {
|
||||||
|
newSequence.append(seq.substring(start_pos,start_pos+1));
|
||||||
|
start_pos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
sequence = newSequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equalsIgnoreNs(String one, String two) {
|
||||||
|
if ( one.length() != two.length() ) { return false; }
|
||||||
|
for ( int idx = 0; idx < one.length(); idx++ ) {
|
||||||
|
if ( one.charAt(idx) != two.charAt(idx) ) {
|
||||||
|
if ( one.charAt(idx) != 'N' && two.charAt(idx) != 'N' ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void print() {
|
||||||
|
out.printf("%s\t%s\t%s%n",allelePos.toString(),probeName,sequence.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -72,6 +72,10 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
||||||
@Ensures("result != null")
|
@Ensures("result != null")
|
||||||
public final GenomeLoc getLocation() { return this; }
|
public final GenomeLoc getLocation() { return this; }
|
||||||
|
|
||||||
|
    /** @return a single-base GenomeLoc at this location's start coordinate, on the same contig */
    public final GenomeLoc getStartLocation() { return new GenomeLoc(getContig(),getContigIndex(),getStart(),getStart()); }

    /** @return a single-base GenomeLoc at this location's stop coordinate, on the same contig */
    public final GenomeLoc getStopLocation() { return new GenomeLoc(getContig(),getContigIndex(),getStop(),getStop()); }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return the name of the contig of this GenomeLoc
|
* @return the name of the contig of this GenomeLoc
|
||||||
*/
|
*/
|
||||||
|
|
@ -142,6 +146,24 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
||||||
Math.max( getStop(), that.getStop()) );
|
Math.max( getStop(), that.getStop()) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
    /**
     * Returns a new GenomeLoc that represents the region between the endpoints of this and that. Requires that
     * this and that GenomeLoc are both mapped.
     * @param that the other endpoint location; must be mapped and on the same contig
     * @return the smallest interval containing both this and that
     * @throws ReviewedStingException if either loc is unmapped or the contigs differ
     */
    @Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
    @Ensures("result != null")
    public GenomeLoc endpointSpan(GenomeLoc that) throws ReviewedStingException {
        // unmapped locs have no coordinates to span
        if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
            throw new ReviewedStingException("Cannot get endpoint span for unmerged genome locs");
        }

        // endpoints on different contigs cannot be bridged into a single interval
        if ( ! this.getContig().equals(that.getContig()) ) {
            throw new ReviewedStingException("Cannot get endpoint span for genome locs on different contigs");
        }

        // span from the smaller start to the larger stop
        return new GenomeLoc(getContig(),this.contigIndex,Math.min(getStart(),that.getStart()),Math.max(getStop(),that.getStop()));
    }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits the contig into to regions: [start,split point) and [split point, end].
|
* Splits the contig into to regions: [start,split point) and [split point, end].
|
||||||
* @param splitPoint The point at which to split the contig. Must be contained in the given interval.
|
* @param splitPoint The point at which to split the contig. Must be contained in the given interval.
|
||||||
|
|
@ -330,6 +352,11 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
    /**
     * Tests whether this location stops at exactly the same coordinate as that,
     * on the same contig.
     * @param that the location to compare against (must not be null)
     * @return true iff the contigs compare equal and the stop positions match
     */
    @Requires("that != null")
    public boolean endsAt(GenomeLoc that) {
        return (this.compareContigs(that) == 0) && ( this.getStop() == that.getStop() );
    }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* How many BPs are covered by this locus?
|
* How many BPs are covered by this locus?
|
||||||
* @return Number of BPs covered by this locus. According to the semantics of GenomeLoc, this should
|
* @return Number of BPs covered by this locus. According to the semantics of GenomeLoc, this should
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,219 @@
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentSource
|
||||||
|
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||||
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||||
|
import org.broadinstitute.sting.queue.function.ListWriterFunction
|
||||||
|
import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
|
||||||
|
import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals
|
||||||
|
import org.broadinstitute.sting.queue.QScript
|
||||||
|
import collection.JavaConversions._
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines
|
||||||
|
|
||||||
|
class FullCallingPipeline extends QScript {
|
||||||
|
qscript =>
|
||||||
|
|
||||||
|
@Input(doc="path to GATK jar", shortName="G")
|
||||||
|
var gatkJar: File = _
|
||||||
|
|
||||||
|
@Input(doc="level of parallelism for UnifiedGenotyper (both for SNPs and indels). By default is set to 20.", shortName="varScatter", required=false)
|
||||||
|
var num_var_scatter_jobs = 20
|
||||||
|
|
||||||
|
@Argument(doc="expand each target in input intervals by the specified number of bases (50 bases by default)", shortName="expand", required=false)
|
||||||
|
var expandIntervals = 50
|
||||||
|
|
||||||
|
private var pipeline: Pipeline = _
|
||||||
|
|
||||||
|
private final val picardFixMatesClass = "net.sf.picard.sam.FixMateInformation"
|
||||||
|
|
||||||
|
|
||||||
|
val BAM_FILES : List[File] = (new XReadLines(new File("/humgen/gsa-hphome1/chartl/projects/oneoffs/VQSR_Exome/resources/broad.bam.list"))).readLines.map(u => new File(u)).toList
|
||||||
|
val DBSNP : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
|
||||||
|
val REF : File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
|
||||||
|
val INTS : File = new File("/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list")
|
||||||
|
val BASE : String = "exon_vqsr"
|
||||||
|
|
||||||
|
val handFiltered : File = new File("/humgen/1kg/exomes/results/broad.wex.96samples/v1/1KGBroadWEx.variants.vcf")
|
||||||
|
|
||||||
|
|
||||||
|
trait CommandLineGATKArgs extends CommandLineGATK {
|
||||||
|
this.intervals :+= INTS
|
||||||
|
this.jarFile = qscript.gatkJar
|
||||||
|
this.reference_sequence = REF
|
||||||
|
this.memoryLimit = Some(4)
|
||||||
|
}
|
||||||
|
// ------------ SETUP THE PIPELINE ----------- //
|
||||||
|
|
||||||
|
|
||||||
|
def script = {
|
||||||
|
|
||||||
|
endToEnd(BASE,"cleaned")
|
||||||
|
}
|
||||||
|
|
||||||
|
def endToEnd(base: String, bamType: String) = {
|
||||||
|
val bamFiles = BAM_FILES
|
||||||
|
|
||||||
|
val ei : ExpandIntervals = new ExpandIntervals(INTS,1,qscript.expandIntervals, new File("Resources", base + ".flanks.interval_list"), REF, "INTERVALS", "INTERVALS")
|
||||||
|
ei.jobOutputFile = new File(".queue/logs/Overall/ExpandIntervals.out")
|
||||||
|
|
||||||
|
if (qscript.expandIntervals > 0) {
|
||||||
|
//add(ei)
|
||||||
|
}
|
||||||
|
|
||||||
|
trait ExpandedIntervals extends CommandLineGATK {
|
||||||
|
if (qscript.expandIntervals > 0) {
|
||||||
|
this.intervals :+= ei.outList
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call indels
|
||||||
|
val indels = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
|
||||||
|
indels.analysisName = base + "_indels"
|
||||||
|
indels.jobOutputFile = new File(".queue/logs/IndelCalling/UnifiedGenotyper.indels.out")
|
||||||
|
indels.memoryLimit = Some(6)
|
||||||
|
indels.downsample_to_coverage = Some(600)
|
||||||
|
indels.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL
|
||||||
|
indels.input_file = bamFiles
|
||||||
|
indels.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
|
||||||
|
indels.out = new File("IndelCalls", base+".indels.vcf")
|
||||||
|
|
||||||
|
indels.scatterCount = qscript.num_var_scatter_jobs
|
||||||
|
indels.setupScatterFunction = {
|
||||||
|
case scatter: ScatterFunction =>
|
||||||
|
scatter.commandDirectory = new File("IndelCalls/ScatterGather")
|
||||||
|
scatter.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter.out")
|
||||||
|
}
|
||||||
|
indels.setupCloneFunction = {
|
||||||
|
case (clone: CloneFunction, index: Int) =>
|
||||||
|
clone.commandDirectory = new File("IndelCalls/ScatterGather/Scatter_%s".format(index))
|
||||||
|
clone.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Scatter_%s.out".format(index))
|
||||||
|
}
|
||||||
|
indels.setupGatherFunction = {
|
||||||
|
case (gather: GatherFunction, source: ArgumentSource) =>
|
||||||
|
gather.commandDirectory = new File("IndelCalls/ScatterGather/Gather_%s".format(source.field.getName))
|
||||||
|
gather.jobOutputFile = new File(".queue/logs/IndelCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter indels
|
||||||
|
val filteredIndels = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
|
||||||
|
filteredIndels.analysisName = base + "_filteredIndels"
|
||||||
|
filteredIndels.jobOutputFile = new File(".queue/logs/IndelCalling/VariantFiltration.indels.out")
|
||||||
|
filteredIndels.filterName ++= List("IndelQUALFilter","IndelSBFilter","IndelQDFilter")
|
||||||
|
filteredIndels.filterExpression ++= List("\"QUAL<30.0\"","\"SB>-1.0\"","\"QD<2\"")
|
||||||
|
filteredIndels.variantVCF = indels.out
|
||||||
|
filteredIndels.out = swapExt("IndelCalls", indels.out, ".vcf",".filtered.vcf")
|
||||||
|
|
||||||
|
// Call snps
|
||||||
|
val snps = new UnifiedGenotyper with CommandLineGATKArgs with ExpandedIntervals
|
||||||
|
snps.analysisName = base+"_snps"
|
||||||
|
snps.jobOutputFile = new File(".queue/logs/SNPCalling/UnifiedGenotyper.snps.out")
|
||||||
|
snps.memoryLimit = Some(6)
|
||||||
|
snps.downsample_to_coverage = Some(600)
|
||||||
|
snps.input_file = bamFiles
|
||||||
|
snps.rodBind :+= RodBind("dbsnp", "vcf", DBSNP)
|
||||||
|
snps.out = new File("SnpCalls", base+".snps.vcf")
|
||||||
|
|
||||||
|
snps.scatterCount = qscript.num_var_scatter_jobs
|
||||||
|
snps.setupScatterFunction = {
|
||||||
|
case scatter: ScatterFunction =>
|
||||||
|
scatter.commandDirectory = new File("SnpCalls/ScatterGather")
|
||||||
|
scatter.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter.out")
|
||||||
|
}
|
||||||
|
snps.setupCloneFunction = {
|
||||||
|
case (clone: CloneFunction, index: Int) =>
|
||||||
|
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
|
||||||
|
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
|
||||||
|
}
|
||||||
|
snps.setupGatherFunction = {
|
||||||
|
case (gather: GatherFunction, source: ArgumentSource) =>
|
||||||
|
gather.commandDirectory = new File("SnpCalls/ScatterGather/Gather_%s".format(source.field.getName))
|
||||||
|
gather.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Gather_%s.out".format(source.field.getName))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter snps at indels
|
||||||
|
val filteredSNPs = new VariantFiltration with CommandLineGATKArgs with ExpandedIntervals
|
||||||
|
filteredSNPs.analysisName = base+"_filteredSNPs"
|
||||||
|
filteredSNPs.jobOutputFile = new File(".queue/logs/SNPCalling/VariantFiltration.snps.out")
|
||||||
|
filteredSNPs.clusterWindowSize = Some(10)
|
||||||
|
filteredSNPs.clusterSize = Some(3)
|
||||||
|
filteredSNPs.rodBind :+= RodBind("mask", "VCF", filteredIndels.out)
|
||||||
|
filteredSNPs.variantVCF = snps.out
|
||||||
|
filteredSNPs.out = swapExt("SnpCalls",snps.out,".vcf",".filtered.vcf")
|
||||||
|
|
||||||
|
// Mako de Clusters
|
||||||
|
val cr = new ContrastiveRecalibrator with CommandLineGATKArgs with ExpandedIntervals
|
||||||
|
cr.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
|
||||||
|
cr.rodBind :+= new RodBind("dbsnp","vcf",DBSNP,"known=true,training=false,truth=false,prior=8.0")
|
||||||
|
cr.rodBind :+= new RodBind("hapmap","vcf", new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"),"known=false,training=true,truth=true,prior=15.0")
|
||||||
|
cr.rodBind :+= new RodBind("omni","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"),"known=false,training=true,truth=true,prior=12.0")
|
||||||
|
cr.allPoly = true
|
||||||
|
cr.use_annotation ++= List("HaplotypeScore","SB","QD","HRun")
|
||||||
|
cr.tranches_file = new File(base+".tranche")
|
||||||
|
cr.recal_file = new File(base+".contrastive.recal.table")
|
||||||
|
cr.tranche ++= List("99.9","99.5","99.25","98.0","97.75","97.65","97.5","97.3","97.2","97.1","98.0","97.5","97.0","96.75","96.5","96.0","95.5","95.0","94.75","94.5","94.25","94.0",
|
||||||
|
"93.75","93.5","93.25","93.0","92.75","92.5","92.25","92.0","91.0","90.0")
|
||||||
|
cr.analysisName = base+"_ContrastiveRecalibrator"
|
||||||
|
cr.memoryLimit = Some(32)
|
||||||
|
cr.num_threads = Some(6)
|
||||||
|
|
||||||
|
|
||||||
|
// Apply the Recalibration
|
||||||
|
val ar = new ApplyRecalibration with CommandLineGATKArgs with ExpandedIntervals
|
||||||
|
ar.rodBind :+= new RodBind("input","vcf",filteredSNPs.out)
|
||||||
|
ar.tranches_file = cr.tranches_file
|
||||||
|
ar.recal_file = cr.recal_file
|
||||||
|
ar.ts_filter_level = Some(91.75)
|
||||||
|
ar.out = new File(base+"_contrastive_recal.91.75.vcf")
|
||||||
|
ar.memoryLimit = Some(6)
|
||||||
|
|
||||||
|
// Variant eval the standard region
|
||||||
|
val stdEval = new VariantEval with CommandLineGATKArgs
|
||||||
|
stdEval.analysisName = base+"_VariantEval"
|
||||||
|
stdEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.std.out")
|
||||||
|
stdEval.noST = true
|
||||||
|
stdEval.noEV = true
|
||||||
|
stdEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
|
||||||
|
stdEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
|
||||||
|
stdEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
|
||||||
|
stdEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
|
||||||
|
stdEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
|
||||||
|
stdEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
|
||||||
|
stdEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
|
||||||
|
stdEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
|
||||||
|
stdEval.out = swapExt(ar.out, ".vcf", ".eval")
|
||||||
|
stdEval.num_threads = Some(6)
|
||||||
|
|
||||||
|
// Variant eval the flanking region
|
||||||
|
val flanksEval = new VariantEval with CommandLineGATKArgs
|
||||||
|
flanksEval.analysisName = base+"_VariantEval"
|
||||||
|
flanksEval.jobOutputFile = new File(".queue/logs/Overall/VariantEval.flanks.out")
|
||||||
|
flanksEval.intervals = List(ei.outList)
|
||||||
|
flanksEval.noST = true
|
||||||
|
flanksEval.noEV = true
|
||||||
|
flanksEval.evalModule ++= List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants","GenotypeConcordance")
|
||||||
|
flanksEval.stratificationModule ++= List("EvalRod", "CompRod", "Novelty","Sample")
|
||||||
|
flanksEval.rodBind :+= RodBind("dbsnp", "vcf",DBSNP)
|
||||||
|
flanksEval.rodBind :+= RodBind("evalContrastive", "VCF", ar.out)
|
||||||
|
flanksEval.rodBind :+= RodBind("evalHandFilter","VCF",handFiltered)
|
||||||
|
flanksEval.rodBind :+= RodBind("compHandFilter","VCF",handFiltered)
|
||||||
|
flanksEval.rodBind :+= RodBind("compAxiom","VCF",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/affymetrix_axiom/Affymetrix_Axiom_DB_2010_v4_b37.noOmni.noHM3.vcf"))
|
||||||
|
flanksEval.rodBind :+= RodBind("compOMNI","vcf",new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"))
|
||||||
|
flanksEval.out = swapExt(ar.out, ".vcf", ".flanks.eval")
|
||||||
|
flanksEval.num_threads = Some(6)
|
||||||
|
|
||||||
|
// Make the bam list
|
||||||
|
val listOfBams = new File("Resources", base +".BamFiles.list")
|
||||||
|
|
||||||
|
val writeBamList = new ListWriterFunction
|
||||||
|
writeBamList.analysisName = base + "_BamList"
|
||||||
|
writeBamList.jobOutputFile = new File(".queue/logs/Overall/WriteBamList.out")
|
||||||
|
writeBamList.inputFiles = bamFiles
|
||||||
|
writeBamList.listFile = listOfBams
|
||||||
|
|
||||||
|
//add(indels, filteredIndels, snps, filteredSNPs, stdEval, writeBamList,cr,ar)
|
||||||
|
add(ar,stdEval)
|
||||||
|
|
||||||
|
if (qscript.expandIntervals > 0) {
|
||||||
|
add(flanksEval)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,148 @@
|
||||||
|
import net.sf.picard.reference.FastaSequenceFile
|
||||||
|
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||||
|
import org.broadinstitute.sting.gatk.DownsampleType
|
||||||
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||||
|
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
||||||
|
import org.broadinstitute.sting.queue.extensions.samtools._
|
||||||
|
import org.broadinstitute.sting.queue.{QException, QScript}
|
||||||
|
import collection.JavaConversions._
|
||||||
|
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
||||||
|
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
|
||||||
|
|
||||||
|
/**
 * Queue pipeline for 1000 Genomes Phase 1 whole-genome calling on a single
 * chromosome: the chromosome is split into fixed-size chunks, each chunk is
 * indel-realigned per population, and SNPs/indels are called per analysis
 * panel (AFR/AMR/EUR/ASN) over the realigned BAMs.
 */
class Phase1WholeGenome extends QScript {
  qscript =>

  @Input(doc="path to GATK jar", shortName="gatk", required=true)
  var gatkJar: File = _

  @Input(doc="the chromosome to process", shortName="chr", required=true)
  var chr: Int = _

  @Input(doc="output path", shortName="outputDir", required=true)
  var outputDir: String = _

  @Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=true)
  var outputTmpDir: String = _

  // Fixed cluster-wide resources: b37 reference, dbSNP 132, and the Dindel
  // indel call set used as known indels for realignment targets.
  private val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
  private val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
  private val dindelCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf"

  // b37 lengths for chromosomes 1..22; indexed by (chr - 1) in script().
  val chromosomeLength = List(249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566)
  // Populations with BAM lists available for this release.
  val populations = List("ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT","LWK","MXL","PUR","TSI","YRI") //,"PPN")

  // Retained for interface compatibility; not read within this class.
  private var pipeline: Pipeline = _

  /** Shared GATK settings: jar, reference, memory, and the dbSNP rod binding. */
  trait CommandLineGATKArgs extends CommandLineGATK {
    this.jarFile = qscript.gatkJar
    this.reference_sequence = qscript.reference
    this.memoryLimit = Some(3)
    this.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP )
  }

  /**
   * One analysis panel (e.g. AFR): holds the SNP and indel UnifiedGenotyper
   * jobs for one chunk. Realigned BAMs are attached later by callThisChunk.
   */
  class AnalysisPanel(val baseName: String, val pops: List[String], val jobNumber: Int) {
    // Common stems for output VCFs and job names for this panel/chunk.
    val outStem = qscript.outputDir + "/calls/chr" + qscript.chr.toString + "/" + baseName + "/" + baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber
    val jobStem = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" + baseName + ".phase1.chr" + qscript.chr.toString + "." + jobNumber

    val rawVCFsnps = new File(outStem + ".raw.snps.vcf")
    val rawVCFindels = new File(outStem + ".raw.indels.vcf")

    // SNP calling: low confidence thresholds (4.0) for discovery; BAQ on.
    val callSnps = new UnifiedGenotyper with CommandLineGATKArgs
    callSnps.out = rawVCFsnps
    callSnps.dcov = Some( 50 )
    callSnps.stand_call_conf = Some( 4.0 )
    callSnps.stand_emit_conf = Some( 4.0 )
    callSnps.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY)
    callSnps.jobName = jobStem + ".raw.snps"
    callSnps.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL)

    // Indel calling: DINDEL model, BAQ off, 454 reads filtered out.
    val callIndels = new UnifiedGenotyper with CommandLineGATKArgs
    callIndels.out = rawVCFindels
    callIndels.dcov = Some( 50 )
    callIndels.stand_call_conf = Some( 10.0 )
    callIndels.stand_emit_conf = Some( 10.0 )
    callIndels.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF)
    callIndels.glm = Some(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
    callIndels.minIndelCnt = Some(5)
    callIndels.read_filter :+= "Platform454"
    callIndels.jobName = jobStem + ".raw.indels"
    callIndels.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL)
    callIndels.abort_at_too_much_coverage = Some(4500)
  }

  /** Walk the chromosome in 700kb windows, scheduling one job per window. */
  def script = {
    val chunkSize: Int = 700000
    val chromEnd: Int = qscript.chromosomeLength(qscript.chr - 1)
    var windowStart: Int = 1
    var chunkIndex: Int = 1
    // Same termination rule as ceil(chromEnd / chunkSize) chunks.
    while( chunkIndex < (chromEnd.toFloat / chunkSize.toFloat) + 1.0) {
      // Clamp the last window to the chromosome end.
      val windowStop: Int = math.min(windowStart - 1 + chunkSize, chromEnd)
      callThisChunk("%d:%d-%d".format(qscript.chr, windowStart, windowStop), chunkIndex)
      windowStart += chunkSize
      chunkIndex += 1
    }
  }

  /**
   * Schedule realignment + calling for one genomic interval: per population,
   * create realignment targets and realign; then feed each realigned BAM into
   * every panel that includes that population, and run the panel callers.
   */
  def callThisChunk(interval: String, jobNumber: Int) = {

    // Panels deliberately overlap (admixed populations feed several panels).
    val AFR = new AnalysisPanel("AFR", List("LWK","YRI","ASW","CLM","PUR"), jobNumber)
    val AMR = new AnalysisPanel("AMR", List("MXL","CLM","PUR","ASW"), jobNumber)
    val EUR = new AnalysisPanel("EUR", List("CEU","FIN","GBR","TSI","IBS","MXL","CLM","PUR","ASW"), jobNumber)
    val ASN = new AnalysisPanel("ASN", List("CHB","CHS","JPT","MXL","CLM","PUR"), jobNumber)
    //val PAA = new AnalysisPanel("PAA", List("PPN","YRI","CHB"), jobNumber)
    val analysisPanels = List(AFR, ASN, AMR, EUR)

    for( pop <- qscript.populations ) {
      val tmpBase: String = qscript.outputTmpDir + "/calls/chr" + qscript.chr.toString + "/" + pop + ".phase1.chr" + qscript.chr.toString + "." + jobNumber.toString + "."
      val popBams: File = new File("/humgen/1kg/processing/allPopulations_wholeGenome_phase1_release/bam_lists/%s.list".format(pop))
      val cleaningTargets: File = new File(tmpBase + "target.intervals")

      // 1.) Create cleaning targets
      val targetCreator = new RealignerTargetCreator with CommandLineGATKArgs
      targetCreator.memoryLimit = Some(3)
      targetCreator.input_file :+= popBams
      targetCreator.intervalsString :+= interval
      targetCreator.out = cleaningTargets
      targetCreator.mismatchFraction = Some(0.0)
      targetCreator.maxIntervalSize = Some(700)
      targetCreator.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls)
      targetCreator.jobName = tmpBase + "target"
      targetCreator.isIntermediate = true

      // 2.) Clean without SW
      val realigner = new IndelRealigner with CommandLineGATKArgs
      val realignedBam = new File(tmpBase + "cleaned.bam")
      realigner.memoryLimit = Some(4)
      realigner.input_file :+= popBams
      realigner.intervalsString :+= interval
      realigner.targetIntervals = cleaningTargets
      realigner.out = realignedBam
      realigner.doNotUseSW = true
      realigner.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE)
      realigner.rodBind :+= RodBind("indels1", "VCF", qscript.dindelCalls)
      realigner.jobName = tmpBase + "clean"
      realigner.isIntermediate = true
      realigner.compress = Some(0)
      realigner.index_output_bam_on_the_fly = Some(true)
      realigner.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true

      add(targetCreator, realigner)

      // Attach this population's realigned BAM to every panel containing it.
      for( panel <- analysisPanels if panel.pops.contains(pop) ) {
        panel.callSnps.input_file :+= realignedBam
        panel.callIndels.input_file :+= realignedBam
      }
    }

    // Restrict each panel's callers to this chunk and schedule them.
    for( panel <- analysisPanels ) {
      panel.callSnps.intervalsString :+= interval
      panel.callIndels.intervalsString :+= interval
      add(panel.callSnps, panel.callIndels)
    }

  }
}
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection
|
||||||
|
import org.broadinstitute.sting.gatk.CommandLineGATK
|
||||||
|
import org.broadinstitute.sting.oneoffprojects.walkers.newassociation.RFAArgumentCollection
|
||||||
|
import org.broadinstitute.sting.queue.extensions.gatk.{RodBind, RFCombine, RFExtractor}
|
||||||
|
import org.broadinstitute.sting.queue.QScript
|
||||||
|
import org.broadinstitute.sting.queue.util.IOUtils
|
||||||
|
|
||||||
|
/**
 * Queue pipeline for read-feature association (RFA): splits a BAM list into
 * batches, runs one RFExtractor per batch, then merges all extractor tables
 * with a single RFCombine job.
 */
class RFAPipeline extends QScript {
  rfapipeline =>

  @Argument(doc="bam list",shortName="bams") var bamList: File = _
  @Argument(doc="sting dir",shortName="sting") var stingDir: String = _
  @Argument(doc="reference file",shortName="ref") var ref: File = _
  @Argument(doc="output base",shortName="o") var baseOut: String = _
  @Argument(doc="interval list",shortName="L") var intervals: File = _
  @ArgumentCollection var rfaArgs : RFAArgumentCollection = new RFAArgumentCollection()
  @Argument(doc="Number of bams per text file output",shortName="br",required = false) var br = 20

  def script = {
    // step one, break the bam files up into batches of at most `br` BAMs
    val subBams : Iterator[List[File]] = extractFileEntries(bamList).toList.grouped(br)

    /** Shared extractor settings; copies the RFA argument collection through. */
    trait ExtractorArgs extends RFExtractor {
      this.reference_sequence = rfapipeline.ref
      this.jarFile = new File(rfapipeline.stingDir+"/dist/GenomeAnalysisTK.jar")
      this.intervals :+= rfapipeline.intervals
      // copy the args into the extractor
      this.windowJump = Some(rfaArgs.windowJump)
      this.windowSize = Some(rfaArgs.windowSize)
      this.fixedZ = Some(rfaArgs.fixedZ)
      this.perSampleZ = Some(rfaArgs.sampleZThresh)
      this.HighInsertSize= Some(rfaArgs.highInsertSize)
      this.LowInsertSize = Some(rfaArgs.lowInsertSize)
      this.clippedBases = Some(rfaArgs.clippedBases)
      this.sampleEpsilon = Some(rfaArgs.EPSILON)
      this.memoryLimit = Some(2)
    }

    // One extractor per batch, each writing <baseOut>.<batchIndex>.txt.
    val extract : List[RFExtractor] = subBams.zipWithIndex.map( u => {
      val g = new RFExtractor with ExtractorArgs
      g.input_file ++= u._1
      g.out = new File("%s.%d.txt".format(rfapipeline.baseOut,u._2))
      g
    }).toList

    addAll(extract)

    /** Shared combiner settings. */
    trait CombineArgs extends RFCombine {
      this.reference_sequence = rfapipeline.ref
      this.jarFile = new File(rfapipeline.stingDir+"/dist/GenomeAnalysisTK.jar")
    }

    val combine : RFCombine = new RFCombine with CombineArgs
    // Hoisted out of the loop below: it was redundantly reassigned per extractor.
    combine.memoryLimit = Some(6)
    // Bind each extractor's output table as rod "s<idx>" (zipWithIndex replaces
    // the previous hand-maintained counter).
    extract.zipWithIndex.foreach { case (ex, idx) =>
      combine.rodBind :+= new RodBind("s%d".format(idx), "table", ex.out)
    }

    combine.out = new File(baseOut)

    add(combine)
  }
}
|
||||||
Loading…
Reference in New Issue