diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCounts.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCounts.java index d3812d678..26fc65291 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCounts.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCounts.java @@ -1,8 +1,12 @@ package org.broadinstitute.sting.playground.gatk.walkers.reducereads; +import org.broadinstitute.sting.alignment.reference.bwt.Counts; import org.broadinstitute.sting.utils.BaseUtils; import com.google.java.contract.*; +import java.util.EnumMap; +import java.util.Map; + /* * Copyright (c) 2009 The Broad Institute * @@ -35,40 +39,49 @@ import com.google.java.contract.*; * Time: 2:55 PM */ final class BaseCounts { - private final int counts[] = new int[4]; // todo -- fixme -- include - and I events + public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.A; + public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - public void incr(byte base) { - int baseI = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseI >= 0 ) // no Ns - counts[baseI]++; + private final Map counts = new EnumMap(BaseIndex.class); // todo -- fixme -- include - and I events + { + for ( BaseIndex i : BaseIndex.values() ) + counts.put(i,0); + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") + public void incr(byte base) { + BaseIndex i = BaseIndex.byteToBase(base); + if ( i != null ) // no Ns + counts.put(i, counts.get(i) + 1); } - @Ensures("BaseUtils.isRegularBase(result)") public byte baseWithMostCounts() { - return BaseUtils.baseIndexToSimpleBase(maxBaseIndex()); + return maxBaseIndex().getByte(); } @Ensures("result >= 0") public int countOfMostCommonBase() { - return counts[maxBaseIndex()]; + return counts.get(maxBaseIndex()); } @Ensures("result >= 0") - public int totalCounts() { + public int totalCount() { int sum = 0; - for ( int c : counts ) { + for ( int c : counts.values() ) { sum += c; } return sum; } - @Ensures("result >= 0 && result < counts.length") - private int maxBaseIndex() { - int maxI = 0; - for ( int i = 0; i < counts.length; i++) { - if ( counts[i] > counts[maxI] ) { + @Ensures({ + "result != null", + "totalCount() != 0 || result == MAX_BASE_INDEX_WITH_NO_COUNTS"}) + private BaseIndex maxBaseIndex() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for ( BaseIndex i : counts.keySet() ) { + if ( counts.get(i) > counts.get(maxI) ) { maxI = i; } } @@ -78,9 +91,39 @@ final class BaseCounts { @Ensures("result != null") public String toString() { StringBuilder b = new StringBuilder(); - for ( int i = 0; i < counts.length; i++ ) { - b.append((char)BaseUtils.baseIndexToSimpleBase(i)).append("=").append(counts[i]).append(","); + for ( Map.Entry elt : counts.entrySet() ) { + b.append(elt.toString()).append("=").append(elt.getValue()).append(","); } return b.toString(); } + + private enum BaseIndex { + A ( 'A', 0 ), + C ( 'C', 1 ), + G ( 'G', 2 ), + T ( 'T', 3 ), + D ( 'D', 4 ), + I ( 'I', 5 ); // insertion to the right of the base + + final byte b; + final int index; + private BaseIndex(char base, int index) { + this.b = (byte)base; + this.index = index; + } + + public byte getByte() { return b; } + + public static final BaseIndex byteToBase(final byte base) { + switch (base) { + case 'A': return A; + case 'C': return C; + case 'G': return G; + case 'T': return T; + case 'D': return D; + case 'I': return I; + default: return null; + } + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java index 3b7d239e3..6891d3b4f 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java @@ -39,29 +39,29 @@ import java.util.Set; */ /** -* Created by IntelliJ IDEA. -* User: depristo -* Date: 4/8/11 -* Time: 3:01 PM -* -*/ + * Created by IntelliJ IDEA. + * User: depristo + * Date: 4/8/11 + * Time: 3:01 PM + * + * Represents a single base site in the consensus calculation. A site corresponds to a place + * on the reference genome, or is a dummy site that is only used to calculate insertion statistics + */ final class ConsensusSite { - final GenomeLoc loc; final Set overlappingReads = new HashSet(); - final int offset; + final int offset, position; final BaseCounts counts = new BaseCounts(); ConsensusSpan.Type markedType = null; - public ConsensusSite(GenomeLoc loc, int offset) { - this.loc = loc; - + public ConsensusSite(int position, int offset) { + this.position = position; this.offset = offset; } - public GenomeLoc getLoc() { - return loc; + public int getPosition() { + return position; } public Set getOverlappingReads() { @@ -75,7 +75,7 @@ final class ConsensusSite { public boolean isStrongConsensus(final double maxFractionDisagreeingBases) { int mostCommon = counts.countOfMostCommonBase(); - int total = counts.totalCounts(); + int total = counts.totalCount(); double fractionCommonBase = (1.0 * mostCommon) / total; return (1 - fractionCommonBase) < maxFractionDisagreeingBases; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReducingSAMFileWriter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReducingSAMFileWriter.java deleted file mode 100644 index 6d683e642..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ReducingSAMFileWriter.java +++ /dev/null @@ -1,287 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.reducereads; - -import net.sf.picard.sam.SamPairUtil; -import net.sf.samtools.*; -import org.apache.log4j.Logger; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.File; -import java.util.*; - -//import org.broadinstitute.sting.utils.SimpleTimer; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * - * @author depristo - * @version 0.1 - */ -public class ReducingSAMFileWriter implements SAMFileWriter { - protected static final Logger logger = Logger.getLogger(ReducingSAMFileWriter.class); - private static final boolean DEBUG = false; - private static final boolean INVERT = false; - private static final boolean PRINT_CONSENSUS_READS = false; - - /** The place where we ultimately write out our records */ - final SAMFileWriter finalDestination; - Queue waitingReads = new LinkedList(); - LinkedList consensusReads = new LinkedList(); - final int IndelContextSize; - final int SNPContextSize; - Set VCs = new HashSet(); - - public long getNCompressedReads() { - return nCompressedReads; - } - - long nCompressedReads = 0; - - /** - * - * @param header - * @param outputFile - * @param compressionLevel - */ - public ReducingSAMFileWriter(final SAMFileHeader header, - final File outputFile, - final int compressionLevel, - final int SNPContextSize, - final int IndelContextSize) { - this(new SAMFileWriterFactory().makeBAMWriter(header, true, outputFile, compressionLevel), - SNPContextSize, - IndelContextSize); - } - - public ReducingSAMFileWriter(final SAMFileWriter finalDestination, - final int SNPContextSize, - final int IndelContextSize) { - this.finalDestination = finalDestination; - this.IndelContextSize = IndelContextSize; - this.SNPContextSize = SNPContextSize; - } - - public int getIndelContextSize() { - return IndelContextSize; - } - - public int getSNPContextSize() { - return SNPContextSize; - } - - /** - * Retrieves the header to use when creating the new SAM file. - * @return header to use when creating the new SAM file. - */ - public SAMFileHeader getFileHeader() { - return finalDestination.getFileHeader(); - } - - /** - * @{inheritDoc} - */ - public void addAlignment( SAMRecord newRead ) { - if ( DEBUG ) logger.info("New read pos " + newRead.getAlignmentStart() + " OP = " + newRead.getAttribute("OP")); - - // if the new read is on a different contig, then we need to flush the queue and clear the map - if ( waitingReads.size() > 0 && waitingReads.peek().getReferenceIndex() != newRead.getReferenceIndex()) { - if ( DEBUG ) logger.warn("Flushing queue on move to new contig: " + newRead.getReferenceName()); - - // TODO -- fixme - while ( ! waitingReads.isEmpty() ) { - // emit to disk - finalDestination.addAlignment(waitingReads.remove()); - } - } - - waitingReads.add(newRead); - emitReadsIfPossible(newRead.getAlignmentStart()); - } - - public void addVariant(VariantContext vc) { - VCs.add(vc); - } - - private void emitReadsIfPossible(int alignmentStartOfLastRead) { - // - // 2 states: - // -- reads ending << vc.getLocation() - // -- read overlapping vc.getLocation() +/- context size - // - while ( ! waitingReads.isEmpty() ) { // there's something in the queue - SAMRecord read = waitingReads.peek(); - if ( ! withinContextOfVariants(read, VCs) ) { - addToConsensus(read); - waitingReads.remove(); - } else if ( cannotOverlapFutureVC(read, alignmentStartOfLastRead) ) { - emitConsensus(); - if ( ! INVERT ) finalDestination.addAlignment(read); - waitingReads.remove(); - } else { - return; - } - } - } - - private void addToConsensus(SAMRecord read) { - if ( ! read.getDuplicateReadFlag() && ! read.getNotPrimaryAlignmentFlag() && ! read.getReadUnmappedFlag() ) - consensusReads.add(read); - } - - private void emitConsensus() { - if ( ! consensusReads.isEmpty() ) { - SAMRecord firstRead = consensusReads.peek(); - - int start = firstRead.getAlignmentStart(); - int end = furtherestEnd(consensusReads); - int len = end - start + 1; - - int[][] baseCounts = new int[len][4]; - for ( SAMRecord read : consensusReads ) { - int readI = 0, refI = read.getAlignmentStart() - start; - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - int l = elt.getLength(); - switch (elt.getOperator()) { - case N: // cannot handle these - break; - case H : case P : // ignore pads and hard clips - break; - case S : refI += l; // move the reference too, in addition to I - case I : - // todo - handle insertions? - readI += l; - break; - case D : - refI += l; - break; - case M : - while (readI < l) { - byte base = read.getReadBases()[readI++]; - int baseI = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseI >= 0 ) // no Ns - baseCounts[refI][baseI]++; - refI++; - } - break; - default: - throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); - } - } - } - - byte[] bases = new byte[len]; - byte[] quals = new byte[len]; - for ( int i = 0; i < len; i++) { - final int maxI = maxBaseCount(baseCounts[i]); - final int count = baseCounts[i][maxI]; - final byte base = count == 0 ? (byte)'N' : BaseUtils.baseIndexToSimpleBase(maxI); - bases[i] = base; - quals[i] = QualityUtils.boundQual(count, (byte)64); - } - - SAMRecord consensus = new SAMRecord(firstRead.getHeader()); - consensus.setReferenceIndex(firstRead.getReferenceIndex()); - consensus.setReadName("Mark"); - consensus.setCigarString(String.format("%dM", len)); - consensus.setReadPairedFlag(false); - consensus.setAlignmentStart(start); - consensus.setReadBases(bases); - consensus.setBaseQualities(quals); - consensus.setMappingQuality(60); - - int nConsensus = consensusReads.size(); - nCompressedReads += nConsensus; - logger.info(String.format("Compressing %5d reads into a single consensus", nConsensus)); - finalDestination.addAlignment(consensus); - - if ( INVERT && PRINT_CONSENSUS_READS ) - for ( SAMRecord read : consensusReads ) - finalDestination.addAlignment(read); - - consensusReads.clear(); - } - } - - private static int furtherestEnd(Collection reads) { - int end = -1; - for ( SAMRecord read : reads ) { - end = Math.max(end, read.getAlignmentEnd()); - } - return end; - } - - private int maxBaseCount(int[] counts) { - int maxI = 0; - for ( int i = 0; i < counts.length; i++) { - if ( counts[i] > counts[maxI] ) { - maxI = i; - } - } - return maxI; - } - - private boolean cannotOverlapFutureVC(SAMRecord read, int alignmentStartOfLastRead) { - return read.getAlignmentEnd() < alignmentStartOfLastRead - Math.max(SNPContextSize, IndelContextSize); - } - - private boolean withinContextOfVariants(SAMRecord read, Collection vcs) { - for ( VariantContext vc : vcs ) { - if ( withinContextOfVariant(read, vc) ) { - return true; - } - } - - return false; - } - - private boolean withinContextOfVariant(SAMRecord read, VariantContext vc) { - if ( ! read.getReferenceName().equals(vc.getChr()) ) - return false; - else if ( vc.isVariant() ) { - int contextSize = vc.isSNP() ? SNPContextSize : IndelContextSize; - int vcContextStart = vc.getStart() - contextSize; - int vcContextEnd = vc.getEnd() + contextSize; - boolean notInContext = read.getAlignmentEnd() < vcContextStart || read.getAlignmentStart() > vcContextEnd; - return ! notInContext; - } else { - return false; - } - } - - /** - * @{inheritDoc} - */ - public void close() { - // write out all of the remaining reads - while ( ! waitingReads.isEmpty() ) { // there's something in the queue - finalDestination.addAlignment(waitingReads.remove()); - } - finalDestination.close(); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/RefPileupElement.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/RefPileupElement.java index 302eb18e7..af06ba952 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/RefPileupElement.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/RefPileupElement.java @@ -65,6 +65,8 @@ public class RefPileupElement extends PileupElement { public Iterator iterator() { List elts = new ArrayList(); + // todo -- need to be ++X not X++ operators. The refI should go from -1, for reads like 2I2M, + // todo -- so that we can represent insertions to the left of the read int readI = 0, refI = read.getAlignmentStart() - refIStart; for ( CigarElement elt : read.getCigar().getCigarElements() ) { int l = elt.getLength(); @@ -80,7 +82,9 @@ public class RefPileupElement extends PileupElement { case I : for ( int i = 0; i < l; i++) if ( refI >= 0 ) - elts.add(new RefPileupElement(read, readI++, refI)); + readI++; + // todo -- replace when insertions handled correctly +// elts.add(new RefPileupElement(read, readI++, refI)); break; case D : for ( int i = 0; i < l; i++) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java index ef1c912d3..a0d655528 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.reducereads; import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.QualityUtils; @@ -264,7 +265,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres } private ConsensusSpan findSpan(List sites, int start, ConsensusSpan.Type consensusType) { - int refStart = sites.get(0).getLoc().getStart(); + int refStart = sites.get(0).getPosition(); for ( int end = start + 1; end < sites.size(); end++ ) { ConsensusSite site = sites.get(end); @@ -282,6 +283,22 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres private List calculateConsensusSites(Collection reads, boolean useAllRemainingReads, GenomeLoc lastProcessedRegion) { + List consensusSites = createEmptyConsensusSites(reads, lastProcessedRegion); + int refStart = consensusSites.get(0).getPosition(); + + for ( SAMRecord read : reads ) { + for ( RefPileupElement p : RefPileupElement.walkRead(read, refStart) ) { + // add to the consensus at this site + if ( p.getRefOffset() >= consensusSites.size() ) + throw new ReviewedStingException("BUG: ref offset off the consensus site list: " + p.getRead() + " at " + p.getRefOffset()); + consensusSites.get(p.getRefOffset()).addOverlappingRead(p); + } + } + + return consensusSites; + } + + private static List createEmptyConsensusSites(Collection reads, GenomeLoc lastProcessedRegion) { SAMRecord firstRead = reads.iterator().next(); int minStart = lastProcessedRegion == null ? -1 : lastProcessedRegion.getStop() + 1; @@ -293,16 +310,9 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres List consensusSites = new ArrayList(); int len = refEnd - refStart + 1; for ( int i = 0; i < len; i++ ) { - int l = refStart + i; - GenomeLoc loc = glParser.createGenomeLoc(contig, l, l); - consensusSites.add(new ConsensusSite(loc, i)); - } - - for ( SAMRecord read : reads ) { - for ( RefPileupElement p : RefPileupElement.walkRead(read, refStart) ) { - // add to the consensus at this site - consensusSites.get(p.getRefOffset()).addOverlappingRead(p); - } + int position = refStart + i; + //GenomeLoc loc = glParser.createGenomeLoc(contig, l, l); + consensusSites.add(new ConsensusSite(position, i)); } return consensusSites; @@ -345,7 +355,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres if ( site.getMarkedType() == ConsensusSpan.Type.VARIABLE ) throw new ReviewedStingException("Variable site included in consensus: " + site); final int count = site.counts.countOfMostCommonBase(); - final byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts(); + byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts(); + if ( !BaseUtils.isRegularBase(base) ) { + // todo -- this code needs to be replaced with cigar building code as well + logger.warn("Substituting N for non-regular consensus base " + (char)base); + base = (byte)'N'; + } + bases[i] = base; quals[i] = QualityUtils.boundQual(count, (byte)64); } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCountsUnitTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCountsUnitTest.java new file mode 100644 index 000000000..6f5e1ad49 --- /dev/null +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/reducereads/BaseCountsUnitTest.java @@ -0,0 +1,73 @@ +// our package +package org.broadinstitute.sting.playground.gatk.walkers.reducereads; + + +// the imports for unit testing. + + +import net.sf.samtools.SAMRecord; +import org.apache.commons.lang.StringUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Basic unit test for BaseCounts in reduced reads + */ +public class BaseCountsUnitTest extends BaseTest { + private class SingleTest { + public String bases; + public byte mostCountBase; + public int mostCommonCount; + + private SingleTest(String bases, char mostCountBase, int mostCommonCount) { + this.mostCommonCount = mostCommonCount; + this.mostCountBase = (byte)mostCountBase; + this.bases = bases; + } + } + + + @DataProvider(name = "data") + public Object[][] createData1() { + List params = new ArrayList(); + + params.add(new SingleTest("A", 'A', 1 )); + params.add(new SingleTest("AA", 'A', 2 )); + params.add(new SingleTest("AC", 'A', 1 )); + params.add(new SingleTest("AAC", 'A', 2 )); + params.add(new SingleTest("AAA", 'A', 3 )); + params.add(new SingleTest("AAAN", 'A', 3 )); + params.add(new SingleTest("AAANNNN", 'A', 3 )); + params.add(new SingleTest("AACTG", 'A', 2 )); + params.add(new SingleTest("D", 'D', 1 )); + params.add(new SingleTest("DDAAD", 'D', 3)); + params.add(new SingleTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 )); + params.add(new SingleTest("AAIIIAI", 'I', 4 )); + + List params2 = new ArrayList(); + for ( SingleTest x : params ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); + } + + + + @Test(dataProvider = "data", enabled = true) + public void testCounting(SingleTest params) { + BaseCounts counts = new BaseCounts(); + + for ( byte base : params.bases.getBytes() ) + counts.incr(base); + + String name = String.format("Test-%s", params.bases); + Assert.assertEquals(counts.totalCount(), params.bases.length() - StringUtils.countMatches(params.bases, "N"), name); + Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name); + Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); + } +} \ No newline at end of file diff --git a/scala/qscript/oneoffs/depristo/PrepareBamsForHomogeneityTesting.scala b/scala/qscript/oneoffs/depristo/PrepareBamsForHomogeneityTesting.scala new file mode 100755 index 000000000..e5bdb16eb --- /dev/null +++ b/scala/qscript/oneoffs/depristo/PrepareBamsForHomogeneityTesting.scala @@ -0,0 +1,81 @@ +package oneoffs.depristo + +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation +import scala.io.Source._ +import org.broadinstitute.sting.queue.function.JavaCommandLineFunction + + +class PrepareBamsForHomogeneityTesting extends QScript { + qscript => + @Argument(doc="Path to GATK jar",required=false,shortName="gatkjarfile") var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") + @Argument(doc="Path to SamToFastq jar",required=false,shortName="SamToFastqjarfile") var SamToFastqJar: File = new File("/seq/software/picard/current/bin/SamToFastq.jar") + @Argument(doc="BAM File",required=true, shortName="bam") var bams: File = null + @Argument(doc="Bundle path",required=false, shortName="bundle") var bundle: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/") + +// @Argument(shortName = "R", doc="ref", required=true) +// var referenceFile: File = bundle + "/human_g1k_v37.fasta" + + @Argument(shortName = "L", doc="intervals", required=false) + val TARGET_INTERVAL: String = null; + + trait GATKArgs extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + if ( TARGET_INTERVAL != null ) + this.intervalsString = List(TARGET_INTERVAL); +// this.reference_sequence = referenceFile; + this.memoryLimit = 2 + } + +// class ClipBAM(@Input in: File, @Output out: File) extends ClipReadsWalker with GATKArgs { +// this.o = out +// this.CT = "1-25" +// this.CR = ClippingRepresentation.HARDCLIP_BASES +// this.OQ = true +// } +// +// case class revert (@Input inBam: File, @Output outBam: File) extends PicardBamFunction { +// @Output(doc="reverted bam index") var revertedBamIndex = new File(outBam + ".bai") +// override def inputBams = List(inBAM) +// override def outputBam = outBam +// override def commandLine = super.commandLine + " CREATE_INDEX=true " +// this.isIntermediate = true +// this.jarFile = qscript.dedupJar +// this.analysisName = queueLogDir + outBam + ".dedup" +// this.jobName = queueLogDir + outBam + ".dedup" +// } + + case class SamToFastq (@Input inBam: File, @Output fastq1: File, @Output fastq2: File, trim: Int) extends JavaCommandLineFunction { + this.jarFile = qscript.SamToFastqJar + override def commandLine = super.commandLine + + Array( + " INPUT=" + inBam, + " FASTQ=" + fastq1, + " VALIDATION_STRINGENCY=SILENT", + " SECOND_END_FASTQ=" + fastq2, + " READ1_TRIM=" + trim, + " READ2_TRIM=" + trim).mkString + + //this.analysisName = queueLogDir + outBam + ".dedup" + //this.jobName = queueLogDir + outBam + ".dedup" + } + + def createListFromFile(in: File):List[File] = { + if (in.toString.endsWith("bam")) + return List(in) + var l: List[File] = List() + for (bam <- fromFile(in).getLines) + l :+= new File(bam) + return l + } + + def script = { + for ( bam <- createListFromFile(bams) ) { + val fastq1 = swapExt(bam, ".bam", ".1.fastq") + val fastq2 = swapExt(bam, ".bam", ".2.fastq") + add(new SamToFastq(bam, fastq1, fastq2, 25)) + } + } +} \ No newline at end of file diff --git a/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala b/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala new file mode 100755 index 000000000..0a9fa300e --- /dev/null +++ b/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala @@ -0,0 +1,123 @@ +package oneoffs.depristo + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.function.JavaCommandLineFunction + +class ReducedBAMEvaluation extends QScript { + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar") + + @Argument(shortName = "R", doc="ref", required=false) + var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Argument(shortName = "bam", doc="BAM", required=true) + val bam: File = null; + + @Argument(shortName = "reduceIntervals", doc="intervals", required=false) + val REDUCE_INTERVAL: String = null; + + @Argument(shortName = "callingIntervals", doc="intervals", required=false) + val CALLING_INTERVAL: String = null; + + @Argument(shortName = "dcov", doc="dcov", required=false) + val DCOV: Int = 250; + + @Argument(shortName = "minimalVCF", doc="", required=false) + val minimalVCF: Boolean = false; + + val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_132.b37.vcf") + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.reference_sequence = referenceFile; + this.memoryLimit = 4 + } + + trait CoFoJa extends JavaCommandLineFunction { + override def javaOpts = super.javaOpts + " -javaagent:lib/cofoja.jar" + } + + def script = { + val reducedBAM = new ReduceBAM(bam) + add(reducedBAM) + callAndEvaluateBAM(reducedBAM.out) + + val slicedBAM = new SliceBAM(bam) + add(slicedBAM) + callAndEvaluateBAM(slicedBAM.out) + } + + def callAndEvaluateBAM(bam: File) = { + val rawVCF = new Call(bam) + add(rawVCF) + + val filterSNPs = new VariantFiltration with UNIVERSAL_GATK_ARGS + filterSNPs.variantVCF = rawVCF.out + filterSNPs.filterName = List("SNP_SB", "SNP_QD", "SNP_HRun") + filterSNPs.filterExpression = List("\"SB>=0.10\"", "\"QD<5.0\"", "\"HRun>=4\"") + filterSNPs.clusterWindowSize = 10 + filterSNPs.clusterSize = 3 + filterSNPs.out = swapExt(rawVCF.out,".vcf",".filtered.vcf") + add(filterSNPs) + + val targetEval = new VariantEval with UNIVERSAL_GATK_ARGS + targetEval.rodBind :+= RodBind("eval", "VCF", filterSNPs.out) + if ( dbSNP.exists() ) + targetEval.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + targetEval.doNotUseAllStandardStratifications = true + targetEval.doNotUseAllStandardModules = true + targetEval.evalModule = List("SimpleMetricsByAC", "TiTvVariantEvaluator", "CountVariants") + targetEval.stratificationModule = List("EvalRod", "CompRod", "Novelty", "Filter") + targetEval.out = swapExt(filterSNPs.out,".vcf",".eval") + add(targetEval) + + // for convenient diffing + add(new DiffableTable(rawVCF.out)) + add(new DiffableTable(filterSNPs.out)) + } + + class ReduceBAM(bam: File) extends ReduceReads with UNIVERSAL_GATK_ARGS with CoFoJa { + this.memoryLimit = 3 + this.input_file = List(bam) + this.o = swapExt(bam,".bam",".reduced.bam") + this.CS = 20 + this.mravs = 50 + this.mbrc = 10000 + + if ( REDUCE_INTERVAL != null ) + this.intervalsString = List(REDUCE_INTERVAL); + } + + class SliceBAM(bam: File) extends PrintReads with UNIVERSAL_GATK_ARGS { + this.memoryLimit = 3 + this.input_file = List(bam) + this.o = swapExt(bam,".bam",".printreads.bam") + if ( REDUCE_INTERVAL != null ) + this.intervalsString = List(REDUCE_INTERVAL); + } + + class Call(@Input(doc="foo") bam: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { + @Output(doc="foo") var outVCF: File = swapExt(bam,".bam",".vcf") + this.input_file = List(bam) + this.stand_call_conf = 50.0 + this.stand_emit_conf = 50.0 + this.dcov = DCOV; + this.o = outVCF + + if ( minimalVCF ) + this.group = List("none") + + if ( CALLING_INTERVAL != null ) { + this.intervalsString = List(CALLING_INTERVAL) + } + } + + class DiffableTable(@Input vcf: File) extends CommandLineFunction { + @Output var out: File = swapExt(vcf,".vcf",".table") + def commandLine = "cut -f 1,2,4,5,7 %s > %s".format(vcf, out) + } +} +