More stable reduced reads representation. Bug fixes throughout. Now diffs by <1% of sites in an exome, and the majority of these differences are filtered out, or are obvious artifacts. UnitTests for BaseCounts. BaseCounts extended to handle indels, but not yet enabled in the consensus reads.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5939 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-06-03 20:11:31 +00:00
parent 80cbc1924b
commit cd293f145b
8 changed files with 384 additions and 331 deletions

View File

@ -1,8 +1,12 @@
package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
import org.broadinstitute.sting.alignment.reference.bwt.Counts;
import org.broadinstitute.sting.utils.BaseUtils;
import com.google.java.contract.*;
import java.util.EnumMap;
import java.util.Map;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -35,40 +39,49 @@ import com.google.java.contract.*;
* Time: 2:55 PM
*/
final class BaseCounts {
private final int counts[] = new int[4]; // todo -- fixme -- include - and I events
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.A;
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
public void incr(byte base) {
int baseI = BaseUtils.simpleBaseToBaseIndex(base);
if ( baseI >= 0 ) // no Ns
counts[baseI]++;
private final Map<BaseIndex, Integer> counts = new EnumMap<BaseIndex, Integer>(BaseIndex.class); // todo -- fixme -- include - and I events
{
for ( BaseIndex i : BaseIndex.values() )
counts.put(i,0);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base) {
BaseIndex i = BaseIndex.byteToBase(base);
if ( i != null ) // no Ns
counts.put(i, counts.get(i) + 1);
}
@Ensures("BaseUtils.isRegularBase(result)")
public byte baseWithMostCounts() {
return BaseUtils.baseIndexToSimpleBase(maxBaseIndex());
return maxBaseIndex().getByte();
}
@Ensures("result >= 0")
public int countOfMostCommonBase() {
return counts[maxBaseIndex()];
return counts.get(maxBaseIndex());
}
@Ensures("result >= 0")
public int totalCounts() {
public int totalCount() {
int sum = 0;
for ( int c : counts ) {
for ( int c : counts.values() ) {
sum += c;
}
return sum;
}
@Ensures("result >= 0 && result < counts.length")
private int maxBaseIndex() {
int maxI = 0;
for ( int i = 0; i < counts.length; i++) {
if ( counts[i] > counts[maxI] ) {
@Ensures({
"result != null",
"totalCount() != 0 || result == MAX_BASE_INDEX_WITH_NO_COUNTS"})
private BaseIndex maxBaseIndex() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for ( BaseIndex i : counts.keySet() ) {
if ( counts.get(i) > counts.get(maxI) ) {
maxI = i;
}
}
@ -78,9 +91,39 @@ final class BaseCounts {
@Ensures("result != null")
public String toString() {
StringBuilder b = new StringBuilder();
for ( int i = 0; i < counts.length; i++ ) {
b.append((char)BaseUtils.baseIndexToSimpleBase(i)).append("=").append(counts[i]).append(",");
for ( Map.Entry<BaseIndex,Integer> elt : counts.entrySet() ) {
b.append(elt.toString()).append("=").append(elt.getValue()).append(",");
}
return b.toString();
}
private enum BaseIndex {
A ( 'A', 0 ),
C ( 'C', 1 ),
G ( 'G', 2 ),
T ( 'T', 3 ),
D ( 'D', 4 ),
I ( 'I', 5 ); // insertion to the right of the base
final byte b;
final int index;
private BaseIndex(char base, int index) {
this.b = (byte)base;
this.index = index;
}
public byte getByte() { return b; }
public static final BaseIndex byteToBase(final byte base) {
switch (base) {
case 'A': return A;
case 'C': return C;
case 'G': return G;
case 'T': return T;
case 'D': return D;
case 'I': return I;
default: return null;
}
}
}
}

View File

@ -39,29 +39,29 @@ import java.util.Set;
*/
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: 4/8/11
* Time: 3:01 PM
*
*/
* Created by IntelliJ IDEA.
* User: depristo
* Date: 4/8/11
* Time: 3:01 PM
*
* Represents a single base site in the consensus calculation. A site corresponds to a place
* on the reference genome, or is a dummy site that is only used to calculate insertion statistics
*/
final class ConsensusSite {
final GenomeLoc loc;
final Set<PileupElement> overlappingReads = new HashSet<PileupElement>();
final int offset;
final int offset, position;
final BaseCounts counts = new BaseCounts();
ConsensusSpan.Type markedType = null;
public ConsensusSite(GenomeLoc loc, int offset) {
this.loc = loc;
public ConsensusSite(int position, int offset) {
this.position = position;
this.offset = offset;
}
public GenomeLoc getLoc() {
return loc;
public int getPosition() {
return position;
}
public Set<PileupElement> getOverlappingReads() {
@ -75,7 +75,7 @@ final class ConsensusSite {
public boolean isStrongConsensus(final double maxFractionDisagreeingBases) {
int mostCommon = counts.countOfMostCommonBase();
int total = counts.totalCounts();
int total = counts.totalCount();
double fractionCommonBase = (1.0 * mostCommon) / total;
return (1 - fractionCommonBase) < maxFractionDisagreeingBases;
}

View File

@ -1,287 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
import net.sf.picard.sam.SamPairUtil;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.util.*;
//import org.broadinstitute.sting.utils.SimpleTimer;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
*
* @author depristo
* @version 0.1
*/
public class ReducingSAMFileWriter implements SAMFileWriter {
protected static final Logger logger = Logger.getLogger(ReducingSAMFileWriter.class);
private static final boolean DEBUG = false;
private static final boolean INVERT = false;
private static final boolean PRINT_CONSENSUS_READS = false;
/** The place where we ultimately write out our records */
final SAMFileWriter finalDestination;
Queue<SAMRecord> waitingReads = new LinkedList<SAMRecord>();
LinkedList<SAMRecord> consensusReads = new LinkedList<SAMRecord>();
final int IndelContextSize;
final int SNPContextSize;
Set<VariantContext> VCs = new HashSet<VariantContext>();
public long getNCompressedReads() {
return nCompressedReads;
}
long nCompressedReads = 0;
/**
*
* @param header
* @param outputFile
* @param compressionLevel
*/
public ReducingSAMFileWriter(final SAMFileHeader header,
final File outputFile,
final int compressionLevel,
final int SNPContextSize,
final int IndelContextSize) {
this(new SAMFileWriterFactory().makeBAMWriter(header, true, outputFile, compressionLevel),
SNPContextSize,
IndelContextSize);
}
public ReducingSAMFileWriter(final SAMFileWriter finalDestination,
final int SNPContextSize,
final int IndelContextSize) {
this.finalDestination = finalDestination;
this.IndelContextSize = IndelContextSize;
this.SNPContextSize = SNPContextSize;
}
public int getIndelContextSize() {
return IndelContextSize;
}
public int getSNPContextSize() {
return SNPContextSize;
}
/**
* Retrieves the header to use when creating the new SAM file.
* @return header to use when creating the new SAM file.
*/
public SAMFileHeader getFileHeader() {
return finalDestination.getFileHeader();
}
/**
* @{inheritDoc}
*/
public void addAlignment( SAMRecord newRead ) {
if ( DEBUG ) logger.info("New read pos " + newRead.getAlignmentStart() + " OP = " + newRead.getAttribute("OP"));
// if the new read is on a different contig, then we need to flush the queue and clear the map
if ( waitingReads.size() > 0 && waitingReads.peek().getReferenceIndex() != newRead.getReferenceIndex()) {
if ( DEBUG ) logger.warn("Flushing queue on move to new contig: " + newRead.getReferenceName());
// TODO -- fixme
while ( ! waitingReads.isEmpty() ) {
// emit to disk
finalDestination.addAlignment(waitingReads.remove());
}
}
waitingReads.add(newRead);
emitReadsIfPossible(newRead.getAlignmentStart());
}
public void addVariant(VariantContext vc) {
VCs.add(vc);
}
private void emitReadsIfPossible(int alignmentStartOfLastRead) {
//
// 2 states:
// -- reads ending << vc.getLocation()
// -- read overlapping vc.getLocation() +/- context size
//
while ( ! waitingReads.isEmpty() ) { // there's something in the queue
SAMRecord read = waitingReads.peek();
if ( ! withinContextOfVariants(read, VCs) ) {
addToConsensus(read);
waitingReads.remove();
} else if ( cannotOverlapFutureVC(read, alignmentStartOfLastRead) ) {
emitConsensus();
if ( ! INVERT ) finalDestination.addAlignment(read);
waitingReads.remove();
} else {
return;
}
}
}
private void addToConsensus(SAMRecord read) {
if ( ! read.getDuplicateReadFlag() && ! read.getNotPrimaryAlignmentFlag() && ! read.getReadUnmappedFlag() )
consensusReads.add(read);
}
private void emitConsensus() {
if ( ! consensusReads.isEmpty() ) {
SAMRecord firstRead = consensusReads.peek();
int start = firstRead.getAlignmentStart();
int end = furtherestEnd(consensusReads);
int len = end - start + 1;
int[][] baseCounts = new int[len][4];
for ( SAMRecord read : consensusReads ) {
int readI = 0, refI = read.getAlignmentStart() - start;
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
int l = elt.getLength();
switch (elt.getOperator()) {
case N: // cannot handle these
break;
case H : case P : // ignore pads and hard clips
break;
case S : refI += l; // move the reference too, in addition to I
case I :
// todo - handle insertions?
readI += l;
break;
case D :
refI += l;
break;
case M :
while (readI < l) {
byte base = read.getReadBases()[readI++];
int baseI = BaseUtils.simpleBaseToBaseIndex(base);
if ( baseI >= 0 ) // no Ns
baseCounts[refI][baseI]++;
refI++;
}
break;
default:
throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
}
}
}
byte[] bases = new byte[len];
byte[] quals = new byte[len];
for ( int i = 0; i < len; i++) {
final int maxI = maxBaseCount(baseCounts[i]);
final int count = baseCounts[i][maxI];
final byte base = count == 0 ? (byte)'N' : BaseUtils.baseIndexToSimpleBase(maxI);
bases[i] = base;
quals[i] = QualityUtils.boundQual(count, (byte)64);
}
SAMRecord consensus = new SAMRecord(firstRead.getHeader());
consensus.setReferenceIndex(firstRead.getReferenceIndex());
consensus.setReadName("Mark");
consensus.setCigarString(String.format("%dM", len));
consensus.setReadPairedFlag(false);
consensus.setAlignmentStart(start);
consensus.setReadBases(bases);
consensus.setBaseQualities(quals);
consensus.setMappingQuality(60);
int nConsensus = consensusReads.size();
nCompressedReads += nConsensus;
logger.info(String.format("Compressing %5d reads into a single consensus", nConsensus));
finalDestination.addAlignment(consensus);
if ( INVERT && PRINT_CONSENSUS_READS )
for ( SAMRecord read : consensusReads )
finalDestination.addAlignment(read);
consensusReads.clear();
}
}
private static int furtherestEnd(Collection<SAMRecord> reads) {
int end = -1;
for ( SAMRecord read : reads ) {
end = Math.max(end, read.getAlignmentEnd());
}
return end;
}
private int maxBaseCount(int[] counts) {
int maxI = 0;
for ( int i = 0; i < counts.length; i++) {
if ( counts[i] > counts[maxI] ) {
maxI = i;
}
}
return maxI;
}
private boolean cannotOverlapFutureVC(SAMRecord read, int alignmentStartOfLastRead) {
return read.getAlignmentEnd() < alignmentStartOfLastRead - Math.max(SNPContextSize, IndelContextSize);
}
private boolean withinContextOfVariants(SAMRecord read, Collection<VariantContext> vcs) {
for ( VariantContext vc : vcs ) {
if ( withinContextOfVariant(read, vc) ) {
return true;
}
}
return false;
}
private boolean withinContextOfVariant(SAMRecord read, VariantContext vc) {
if ( ! read.getReferenceName().equals(vc.getChr()) )
return false;
else if ( vc.isVariant() ) {
int contextSize = vc.isSNP() ? SNPContextSize : IndelContextSize;
int vcContextStart = vc.getStart() - contextSize;
int vcContextEnd = vc.getEnd() + contextSize;
boolean notInContext = read.getAlignmentEnd() < vcContextStart || read.getAlignmentStart() > vcContextEnd;
return ! notInContext;
} else {
return false;
}
}
/**
* @{inheritDoc}
*/
public void close() {
// write out all of the remaining reads
while ( ! waitingReads.isEmpty() ) { // there's something in the queue
finalDestination.addAlignment(waitingReads.remove());
}
finalDestination.close();
}
}

View File

@ -65,6 +65,8 @@ public class RefPileupElement extends PileupElement {
public Iterator<RefPileupElement> iterator() {
List<RefPileupElement> elts = new ArrayList<RefPileupElement>();
// todo -- need to be ++X not X++ operators. The refI should go from -1, for reads like 2I2M,
// todo -- so that we can represent insertions to the left of the read
int readI = 0, refI = read.getAlignmentStart() - refIStart;
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
int l = elt.getLength();
@ -80,7 +82,9 @@ public class RefPileupElement extends PileupElement {
case I :
for ( int i = 0; i < l; i++)
if ( refI >= 0 )
elts.add(new RefPileupElement(read, readI++, refI));
readI++;
// todo -- replace when insertions handled correctly
// elts.add(new RefPileupElement(read, readI++, refI));
break;
case D :
for ( int i = 0; i < l; i++)

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.QualityUtils;
@ -264,7 +265,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
}
private ConsensusSpan findSpan(List<ConsensusSite> sites, int start, ConsensusSpan.Type consensusType) {
int refStart = sites.get(0).getLoc().getStart();
int refStart = sites.get(0).getPosition();
for ( int end = start + 1; end < sites.size(); end++ ) {
ConsensusSite site = sites.get(end);
@ -282,6 +283,22 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
private List<ConsensusSite> calculateConsensusSites(Collection<SAMRecord> reads, boolean useAllRemainingReads, GenomeLoc lastProcessedRegion) {
List<ConsensusSite> consensusSites = createEmptyConsensusSites(reads, lastProcessedRegion);
int refStart = consensusSites.get(0).getPosition();
for ( SAMRecord read : reads ) {
for ( RefPileupElement p : RefPileupElement.walkRead(read, refStart) ) {
// add to the consensus at this site
if ( p.getRefOffset() >= consensusSites.size() )
throw new ReviewedStingException("BUG: ref offset off the consensus site list: " + p.getRead() + " at " + p.getRefOffset());
consensusSites.get(p.getRefOffset()).addOverlappingRead(p);
}
}
return consensusSites;
}
private static List<ConsensusSite> createEmptyConsensusSites(Collection<SAMRecord> reads, GenomeLoc lastProcessedRegion) {
SAMRecord firstRead = reads.iterator().next();
int minStart = lastProcessedRegion == null ? -1 : lastProcessedRegion.getStop() + 1;
@ -293,16 +310,9 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
List<ConsensusSite> consensusSites = new ArrayList<ConsensusSite>();
int len = refEnd - refStart + 1;
for ( int i = 0; i < len; i++ ) {
int l = refStart + i;
GenomeLoc loc = glParser.createGenomeLoc(contig, l, l);
consensusSites.add(new ConsensusSite(loc, i));
}
for ( SAMRecord read : reads ) {
for ( RefPileupElement p : RefPileupElement.walkRead(read, refStart) ) {
// add to the consensus at this site
consensusSites.get(p.getRefOffset()).addOverlappingRead(p);
}
int position = refStart + i;
//GenomeLoc loc = glParser.createGenomeLoc(contig, l, l);
consensusSites.add(new ConsensusSite(position, i));
}
return consensusSites;
@ -345,7 +355,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
if ( site.getMarkedType() == ConsensusSpan.Type.VARIABLE )
throw new ReviewedStingException("Variable site included in consensus: " + site);
final int count = site.counts.countOfMostCommonBase();
final byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts();
byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts();
if ( !BaseUtils.isRegularBase(base) ) {
// todo -- this code needs to be replaced with cigar building code as well
logger.warn("Substituting N for non-regular consensus base " + (char)base);
base = (byte)'N';
}
bases[i] = base;
quals[i] = QualityUtils.boundQual(count, (byte)64);
}

View File

@ -0,0 +1,73 @@
// our package
package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
// the imports for unit testing.
import net.sf.samtools.SAMRecord;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.List;
/**
* Basic unit test for BaseCounts in reduced reads
*/
/**
 * Basic unit test for BaseCounts in reduced reads
 */
public class BaseCountsUnitTest extends BaseTest {
    /** One counting scenario: the input base string plus the expected winner and its count. */
    private class SingleTest {
        public String bases;
        public byte mostCountBase;
        public int mostCommonCount;

        private SingleTest(String bases, char mostCountBase, int mostCommonCount) {
            this.bases = bases;
            this.mostCountBase = (byte)mostCountBase;
            this.mostCommonCount = mostCommonCount;
        }
    }

    @DataProvider(name = "data")
    public Object[][] createData1() {
        // scenarios cover single bases, ties, Ns (which are ignored), deletions (D),
        // insertions (I), and the empty input
        final SingleTest[] scenarios = {
            new SingleTest("A", 'A', 1),
            new SingleTest("AA", 'A', 2),
            new SingleTest("AC", 'A', 1),
            new SingleTest("AAC", 'A', 2),
            new SingleTest("AAA", 'A', 3),
            new SingleTest("AAAN", 'A', 3),
            new SingleTest("AAANNNN", 'A', 3),
            new SingleTest("AACTG", 'A', 2),
            new SingleTest("D", 'D', 1),
            new SingleTest("DDAAD", 'D', 3),
            new SingleTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0),
            new SingleTest("AAIIIAI", 'I', 4)
        };

        // TestNG wants each scenario wrapped in its own Object[] row
        final List<Object[]> rows = new ArrayList<Object[]>();
        for ( final SingleTest scenario : scenarios )
            rows.add(new Object[]{scenario});
        return rows.toArray(new Object[][]{});
    }

    @Test(dataProvider = "data", enabled = true)
    public void testCounting(SingleTest params) {
        // feed every base of the scenario into a fresh counter
        final BaseCounts counter = new BaseCounts();
        for ( final byte base : params.bases.getBytes() )
            counter.incr(base);

        final String label = String.format("Test-%s", params.bases);
        // Ns are not counted, so the expected total excludes them
        final int expectedTotal = params.bases.length() - StringUtils.countMatches(params.bases, "N");
        Assert.assertEquals(counter.totalCount(), expectedTotal, label);
        Assert.assertEquals(counter.countOfMostCommonBase(), params.mostCommonCount, label);
        Assert.assertEquals((char)counter.baseWithMostCounts(), (char)params.mostCountBase, label);
    }
}

View File

@ -0,0 +1,81 @@
package oneoffs.depristo
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation
import scala.io.Source._
import org.broadinstitute.sting.queue.function.JavaCommandLineFunction
// Queue pipeline that converts each input BAM into a pair of trimmed FASTQ files,
// preparing inputs for homogeneity testing.
class PrepareBamsForHomogeneityTesting extends QScript {
qscript =>
// command-line arguments: tool jar locations, the input BAM (or a .list of BAMs), and the resource bundle path
@Argument(doc="Path to GATK jar",required=false,shortName="gatkjarfile") var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar")
@Argument(doc="Path to SamToFastq jar",required=false,shortName="SamToFastqjarfile") var SamToFastqJar: File = new File("/seq/software/picard/current/bin/SamToFastq.jar")
@Argument(doc="BAM File",required=true, shortName="bam") var bams: File = null
@Argument(doc="Bundle path",required=false, shortName="bundle") var bundle: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/")
// @Argument(shortName = "R", doc="ref", required=true)
// var referenceFile: File = bundle + "/human_g1k_v37.fasta"
@Argument(shortName = "L", doc="intervals", required=false)
val TARGET_INTERVAL: String = null;
// shared settings mixed into every GATK command this script would launch
trait GATKArgs extends CommandLineGATK {
this.logging_level = "INFO";
this.jarFile = gatkJarFile;
if ( TARGET_INTERVAL != null )
this.intervalsString = List(TARGET_INTERVAL);
// this.reference_sequence = referenceFile;
this.memoryLimit = 2
}
// class ClipBAM(@Input in: File, @Output out: File) extends ClipReadsWalker with GATKArgs {
// this.o = out
// this.CT = "1-25"
// this.CR = ClippingRepresentation.HARDCLIP_BASES
// this.OQ = true
// }
//
// case class revert (@Input inBam: File, @Output outBam: File) extends PicardBamFunction {
// @Output(doc="reverted bam index") var revertedBamIndex = new File(outBam + ".bai")
// override def inputBams = List(inBAM)
// override def outputBam = outBam
// override def commandLine = super.commandLine + " CREATE_INDEX=true "
// this.isIntermediate = true
// this.jarFile = qscript.dedupJar
// this.analysisName = queueLogDir + outBam + ".dedup"
// this.jobName = queueLogDir + outBam + ".dedup"
// }
// Picard SamToFastq invocation: converts inBam into a pair of FASTQs, trimming
// `trim` bases from the start of read 1 and read 2
case class SamToFastq (@Input inBam: File, @Output fastq1: File, @Output fastq2: File, trim: Int) extends JavaCommandLineFunction {
this.jarFile = qscript.SamToFastqJar
override def commandLine = super.commandLine +
Array(
" INPUT=" + inBam,
" FASTQ=" + fastq1,
" VALIDATION_STRINGENCY=SILENT",
" SECOND_END_FASTQ=" + fastq2,
" READ1_TRIM=" + trim,
" READ2_TRIM=" + trim).mkString
//this.analysisName = queueLogDir + outBam + ".dedup"
//this.jobName = queueLogDir + outBam + ".dedup"
}
// Expands `in` into a list of BAMs: a path ending in "bam" is returned as-is;
// anything else is treated as a file-of-filenames with one BAM path per line
def createListFromFile(in: File):List[File] = {
if (in.toString.endsWith("bam"))
return List(in)
var l: List[File] = List()
for (bam <- fromFile(in).getLines)
l :+= new File(bam)
return l
}
// entry point: schedule one SamToFastq job per input BAM, trimming 25 bases per read end
def script = {
for ( bam <- createListFromFile(bams) ) {
val fastq1 = swapExt(bam, ".bam", ".1.fastq")
val fastq2 = swapExt(bam, ".bam", ".2.fastq")
add(new SamToFastq(bam, fastq1, fastq2, 25))
}
}
}

View File

@ -0,0 +1,123 @@
package oneoffs.depristo
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.function.JavaCommandLineFunction
// Queue pipeline that evaluates reduced-reads output: reduces the input BAM (ReduceReads),
// makes a PrintReads copy of the same BAM as a control, then calls, filters and
// VariantEval's each so the two call sets can be compared.
class ReducedBAMEvaluation extends QScript {
@Argument(doc="gatkJarFile", required=false)
var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar")
@Argument(shortName = "R", doc="ref", required=false)
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
@Argument(shortName = "bam", doc="BAM", required=true)
val bam: File = null;
@Argument(shortName = "reduceIntervals", doc="intervals", required=false)
val REDUCE_INTERVAL: String = null;
@Argument(shortName = "callingIntervals", doc="intervals", required=false)
val CALLING_INTERVAL: String = null;
@Argument(shortName = "dcov", doc="dcov", required=false)
val DCOV: Int = 250;
@Argument(shortName = "minimalVCF", doc="", required=false)
val minimalVCF: Boolean = false;
// dbSNP rod used as the comp track in VariantEval, when present on disk
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_132.b37.vcf")
// shared settings mixed into every GATK command in this script
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
this.logging_level = "INFO";
this.jarFile = gatkJarFile;
this.reference_sequence = referenceFile;
this.memoryLimit = 4
}
// mixes in the CoFoJa contract-checking java agent for commands that want runtime contract checks
trait CoFoJa extends JavaCommandLineFunction {
override def javaOpts = super.javaOpts + " -javaagent:lib/cofoja.jar"
}
// entry point: reduce and slice the input BAM, then call + evaluate each result
def script = {
val reducedBAM = new ReduceBAM(bam)
add(reducedBAM)
callAndEvaluateBAM(reducedBAM.out)
val slicedBAM = new SliceBAM(bam)
add(slicedBAM)
callAndEvaluateBAM(slicedBAM.out)
}
// schedules the calling/filtering/evaluation chain for one BAM:
// UnifiedGenotyper -> VariantFiltration -> VariantEval, plus diffable tables
def callAndEvaluateBAM(bam: File) = {
val rawVCF = new Call(bam)
add(rawVCF)
// hard filters on strand bias, quality-by-depth and homopolymer runs, plus SNP clusters
val filterSNPs = new VariantFiltration with UNIVERSAL_GATK_ARGS
filterSNPs.variantVCF = rawVCF.out
filterSNPs.filterName = List("SNP_SB", "SNP_QD", "SNP_HRun")
filterSNPs.filterExpression = List("\"SB>=0.10\"", "\"QD<5.0\"", "\"HRun>=4\"")
filterSNPs.clusterWindowSize = 10
filterSNPs.clusterSize = 3
filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf")
add(filterSNPs)
val targetEval = new VariantEval with UNIVERSAL_GATK_ARGS
targetEval.rodBind :+= RodBind("eval", "VCF", filterSNPs.out)
if ( dbSNP.exists() )
targetEval.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
targetEval.doNotUseAllStandardStratifications = true
targetEval.doNotUseAllStandardModules = true
targetEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants")
targetEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter")
targetEval.out = swapExt(filterSNPs.out,".vcf",".eval")
add(targetEval)
// for convenient diffing
add(new DiffableTable(rawVCF.out))
add(new DiffableTable(filterSNPs.out))
}
// runs ReduceReads (with contract checking) to produce the reduced BAM
class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa {
this.memoryLimit = 3
this.input_file = List(bam)
this.o = swapExt(bam,".bam",".reduced.bam")
this.CS = 20
this.mravs = 50
this.mbrc = 10000
if ( REDUCE_INTERVAL != null )
this.intervalsString = List(REDUCE_INTERVAL);
}
// control arm: PrintReads copy of the same BAM over the same intervals
class SliceBAM(bam: File) extends PrintReads with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 3
this.input_file = List(bam)
this.o = swapExt(bam,".bam",".printreads.bam")
if ( REDUCE_INTERVAL != null )
this.intervalsString = List(REDUCE_INTERVAL);
}
// UnifiedGenotyper SNP calling on one BAM
class Call(@Input(doc="foo") bam: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
@Output(doc="foo") var outVCF: File = swapExt(bam,".bam",".vcf")
this.input_file = List(bam)
this.stand_call_conf = 50.0
this.stand_emit_conf = 50.0
this.dcov = DCOV;
this.o = outVCF
if ( minimalVCF )
this.group = List("none")
if ( CALLING_INTERVAL != null ) {
this.intervalsString = List(CALLING_INTERVAL)
}
}
// strips a VCF down to chrom/pos/ref/alt/filter columns for convenient diffing
class DiffableTable(@Input vcf: File) extends CommandLineFunction {
@Output var out: File = swapExt(vcf,".vcf",".table")
def commandLine = "cut -f 1,2,4,5,7 %s > %s".format(vcf, out)
}
}