From 80a4ce0edf56df5c0da42c4ddf936d7f2a9191a9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 23 Jan 2012 09:52:07 -0500 Subject: [PATCH 01/22] Bugfix for incorrect error messages for missing BAMs and VCFs -- Missing BAMs were appearing as StingExceptions -- Missing VCFs were showing up as CommandLineErrors, but it's clearer for them to be CouldNotReadInputFile exceptions -- Added integration tests to ensure missing BAMs, VCFs, and -L files are properly thrown as CouldNotReadInputFile exceptions -- Added path to standard b37 BAM to BaseTest -- Cleaned up code in SAMDataSource, removing my parallel loading code as this just didn't prove to be useful. --- .../commandline/ArgumentTypeDescriptor.java | 9 +- .../gatk/datasources/reads/SAMDataSource.java | 101 +++++------------- .../org/broadinstitute/sting/BaseTest.java | 2 + .../gatk/EngineFeaturesIntegrationTest.java | 19 ++++ 4 files changed, 55 insertions(+), 76 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 31212a46f..94ed23caf 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -436,9 +436,12 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { String.format("Failed to parse value %s for argument %s.", value, source.field.getName())); } catch (Exception e) { - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s. Message: %s", - value, source.field.getName(), e.getMessage())); + if ( e instanceof UserException ) + throw ((UserException)e); + else + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s. 
Message: %s", + value, source.field.getName(), e.getMessage())); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 2e243b847..bba5c21e2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -29,6 +29,7 @@ import net.sf.picard.sam.MergingSamRecordIterator; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadMetrics; @@ -49,6 +50,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; +import java.io.FileNotFoundException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; @@ -64,9 +66,6 @@ import java.util.concurrent.*; public class SAMDataSource { final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); - /** If true, we will load SAMReaders in parallel */ - final private static boolean USE_PARALLEL_LOADING = false; - /** Backing support for reads. */ protected final ReadProperties readProperties; @@ -726,74 +725,23 @@ public class SAMDataSource { int readerNumber = 1; final SimpleTimer timer = new SimpleTimer().start(); - if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords " + (USE_PARALLEL_LOADING ? "in parallel" : "in serial")); - if ( ! 
USE_PARALLEL_LOADING ) { - final int tickSize = 50; - int nExecutedTotal = 0; - long lastTick = timer.currentTime(); - for(final SAMReaderID readerID: readerIDs) { - final ReaderInitializer init = new ReaderInitializer(readerID).call(); - if (threadAllocation.getNumIOThreads() > 0) { - inputStreams.put(init.readerID, init.blockInputStream); // get from initializer - } - - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); - readers.put(init.readerID,init.reader); - if ( ++nExecutedTotal % tickSize == 0) { - double tickInSec = (timer.currentTime() - lastTick) / 1000.0; - printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); - lastTick = timer.currentTime(); - } - } - } else { - final int N_THREADS = 8; - - final ExecutorService executor = Executors.newFixedThreadPool(N_THREADS); - final List inits = new ArrayList(totalNumberOfFiles); - Queue> futures = new LinkedList>(); - for (final SAMReaderID readerID: readerIDs) { - logger.debug("Enqueuing for initialization: " + readerID.samFile); - final ReaderInitializer init = new ReaderInitializer(readerID); - inits.add(init); - futures.add(executor.submit(init)); + if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial"); + final int tickSize = 50; + int nExecutedTotal = 0; + long lastTick = timer.currentTime(); + for(final SAMReaderID readerID: readerIDs) { + final ReaderInitializer init = new ReaderInitializer(readerID).call(); + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer } - try { - final int MAX_WAIT = 30 * 1000; - final int MIN_WAIT = 1 * 1000; - - while ( ! 
futures.isEmpty() ) { - final int prevSize = futures.size(); - final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file - final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT)); - Thread.sleep(waitTimeInMS); - - Queue> pending = new LinkedList>(); - for ( final Future initFuture : futures ) { - if ( initFuture.isDone() ) { - final ReaderInitializer init = initFuture.get(); - if (threadAllocation.getNumIOThreads() > 0) { - inputStreams.put(init.readerID, init.blockInputStream); // get from initializer - } - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID)); - readers.put(init.readerID, init.reader); - } else { - pending.add(initFuture); - } - } - - final int nExecutedTotal = totalNumberOfFiles - pending.size(); - final int nExecutedInTick = prevSize - pending.size(); - printReaderPerformance(nExecutedTotal, nExecutedInTick, totalNumberOfFiles, timer, waitTimeInMS / 1000.0); - futures = pending; - } - } catch ( InterruptedException e ) { - throw new ReviewedStingException("Interrupted SAMReader initialization", e); - } catch ( ExecutionException e ) { - throw new ReviewedStingException("Execution exception during SAMReader initialization", e); + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); + readers.put(init.readerID,init.reader); + if ( ++nExecutedTotal % tickSize == 0) { + double tickInSec = (timer.currentTime() - lastTick) / 1000.0; + printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); + lastTick = timer.currentTime(); } - - executor.shutdown(); } if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); @@ -913,12 +861,19 @@ public class SAMDataSource { public ReaderInitializer call() { final File indexFile = findIndexFile(readerID.samFile); - if 
(threadAllocation.getNumIOThreads() > 0) { - blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(blockInputStream,indexFile,false); + try { + if (threadAllocation.getNumIOThreads() > 0) { + blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(blockInputStream,indexFile,false); + } + else + reader = new SAMFileReader(readerID.samFile,indexFile,false); + } catch ( RuntimeIOException e ) { + if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException ) + throw new UserException.CouldNotReadInputFile(readerID.samFile, e); + else + throw e; } - else - reader = new SAMFileReader(readerID.samFile,indexFile,false); reader.setSAMRecordFactory(factory); reader.enableFileSource(true); reader.setValidationStringency(validationStringency); diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 8e218f950..61829dcfc 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -52,6 +52,8 @@ public abstract class BaseTest { public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/"; public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; + public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static final String refseqAnnotationLocation = annotationDataLocation + "refseq/"; public static final String hg18Refseq = refseqAnnotationLocation + "refGene-big-table-hg18.txt"; public static final String hg19Refseq = refseqAnnotationLocation + "refGene-big-table-hg19.txt"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index 5b5083ef3..8cc2ffd96 100644 --- 
a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -54,4 +54,23 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { @Test() private void testBadRODBindingInputTypeUnknownType() { testBadRODBindingInput("bedXXX", "Unknown input to VCF expecting walker", UserException.UnknownTribbleType.class); } + + private void testMissingFile(String name, String missingBinding) { + WalkerTestSpec spec = new WalkerTestSpec(missingBinding + " -R " + b37KGReference + " -o %s", + 1, UserException.CouldNotReadInputFile.class); + executeTest(name, spec); + } + + @Test() private void testMissingBAMnt1() { + testMissingFile("missing BAM", "-T UnifiedGenotyper -I missing.bam -nt 1"); + } + @Test() private void testMissingBAMnt4() { + testMissingFile("missing BAM", "-T UnifiedGenotyper -I missing.bam -nt 4"); + } + @Test() private void testMissingVCF() { + testMissingFile("missing VCF", "-T SelectVariants -V missing.vcf"); + } + @Test() private void testMissingInterval() { + testMissingFile("missing interval", "-T UnifiedGenotyper -L missing.interval_list -I " + b37GoodBAM); + } } \ No newline at end of file From 798596257b76d51730b8948c9e35624faad8bb44 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 23 Jan 2012 10:50:16 -0500 Subject: [PATCH 02/22] Enable the Genotype Phasing Evaluator. Because it didn't have the same argument structure as the base class, update2 of VariantEvaluator was being called, rather than update2 of the actual module. 
--- .../varianteval/evaluators/GenotypePhasingEvaluator.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index ea12ada48..07cd95997 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -80,6 +80,10 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { return getName() + ": "; } + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return update2(eval,comp,tracker,ref,context,null); + } + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, NewEvaluationContext group) { //public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { Reasons interesting = new Reasons(); From c18beadbdb4b864d4e04d453a6edebac7ab0818a Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 23 Jan 2012 16:17:04 -0500 Subject: [PATCH 06/22] Device files like /dev/null are now tracked as special by Queue and are not used to generate .out file paths, scattered into a temporary directory, gathered, deleted, etc. Attempted workaround for xdr_resourceInfoReq unsatisfied link during loading of libbat.so. 
--- .../sting/jna/lsf/v7_0_6/LibBat.java | 24 ++++--- .../gatk/ArgumentDefinitionField.java | 6 +- .../sting/utils/io/IOUtils.java | 19 +++++- .../sting/utils/io/IOUtilsUnitTest.java | 36 ++++++++++ .../qscripts/examples/DevNullOutput.scala | 49 ++++++++++++++ .../sting/queue/function/QFunction.scala | 8 ++- .../ScatterGatherableFunction.scala | 10 +-- .../examples/DevNullOutputPipelineTest.scala | 67 +++++++++++++++++++ .../ExampleUnifiedGenotyperPipelineTest.scala | 1 + 9 files changed, 199 insertions(+), 21 deletions(-) create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala create mode 100644 public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java index d7b34a253..f948a9bcf 100644 --- a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java +++ b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java @@ -71,6 +71,14 @@ import org.broadinstitute.sting.jna.clibrary.LibC; public class LibBat { static { + // via Platform LSF Configuration Reference, by default quiet the BSUB output. + if ("Y".equals(System.getProperty("BSUB_QUIET", "Y"))) + LibC.setenv("BSUB_QUIET", "Y", 1); + String lsfLibDir = System.getenv("LSF_LIBDIR"); + if (lsfLibDir != null) { + NativeLibrary.addSearchPath("lsf", lsfLibDir); + NativeLibrary.addSearchPath("bat", lsfLibDir); + } /* LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+. 
nm $LSF_LIBDIR/liblsf.dylib | grep environ @@ -79,16 +87,14 @@ public class LibBat { */ if (Platform.isMac()) NativeLibrary.getInstance("environhack"); - String lsfLibDir = System.getenv("LSF_LIBDIR"); - if (lsfLibDir != null) { - NativeLibrary.addSearchPath("lsf", lsfLibDir); - NativeLibrary.addSearchPath("bat", lsfLibDir); - } - NativeLibrary.getInstance("lsf"); - // via Platform LSF Configuration Reference, by default quiet the BSUB output. - if ("Y".equals(System.getProperty("BSUB_QUIET", "Y"))) - LibC.setenv("BSUB_QUIET", "Y", 1); + NativeLibrary liblsf = NativeLibrary.getInstance("lsf"); Native.register("bat"); + // HACK: Running into a weird error: + // java.lang.UnsatisfiedLinkError: Unable to load library 'bat': <$LSF_LIBDIR>/libbat.so: undefined symbol: xdr_resourceInfoReq + // This function is very clearly unsatisfied by running 'nm $LSF_LIBDIR/libbat.so | grep xdr_resourceInfoReq' but is + // found in liblsf.so when running 'nm $LSF_LIBDIR/liblsf.so | grep xdr_resourceInfoReq'. For now holding on to a reference + // to the LSF lib just in case this is a problem with the NativeLibrary's internal WeakReferences and the library being unloaded? 
+ liblsf.getFunction("xdr_resourceInfoReq").getName(); } // Via support@platform.com: diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java index 71640c66a..00a6ac1ae 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java @@ -468,7 +468,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } @Override protected String getFreezeFields() { return String.format( - ("if (%2$s != null)%n" + + ("if (%2$s != null && !org.broadinstitute.sting.utils.io.IOUtils.isSpecialFile(%2$s))%n" + " if (!org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor.isCompressed(%2$s.getPath))%n" + " %1$s = new File(%2$s.getPath + \"%3$s\")%n"), auxFieldName, originalFieldName, Tribble.STANDARD_INDEX_EXTENSION); @@ -481,7 +481,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } @Override protected String getFreezeFields() { return String.format( - ("if (%2$s != null)%n" + + ("if (%2$s != null && !org.broadinstitute.sting.utils.io.IOUtils.isSpecialFile(%2$s))%n" + " if (!%3$s)%n" + " %1$s = new File(%2$s.getPath.stripSuffix(\".bam\") + \"%4$s\")%n"), auxFieldName, originalFieldName, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME, BAMIndex.BAMIndexSuffix); @@ -494,7 +494,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } @Override protected String getFreezeFields() { return String.format( - ("if (%2$s != null)%n" + + ("if (%2$s != null && !org.broadinstitute.sting.utils.io.IOUtils.isSpecialFile(%2$s))%n" + " if (%3$s)%n" + " %1$s = new File(%2$s.getPath + \"%4$s\")%n"), auxFieldName, originalFieldName, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME, ".md5"); diff --git 
a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java index b3fdb93d3..a5ba857ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -37,6 +37,7 @@ import java.util.*; public class IOUtils { private static Logger logger = Logger.getLogger(IOUtils.class); + private static final File DEV_DIR = new File("/dev"); /** * Checks if the temp directory has been setup and throws an exception if they user hasn't set it correctly. @@ -301,12 +302,17 @@ public class IOUtils { } /** - * Tries to delete a file. Emits a warning if the file was unable to be deleted. + * Tries to delete a file. Emits a warning if the file + * is not a special file and was unable to be deleted. * * @param file File to delete. * @return true if the file was deleted. */ public static boolean tryDelete(File file) { + if (isSpecialFile(file)) { + logger.debug("Not trying to delete " + file); + return false; + } boolean deleted = FileUtils.deleteQuietly(file); if (deleted) logger.debug("Deleted " + file); @@ -385,4 +391,13 @@ public class IOUtils { } } + + /** + * Returns true if the file is a special file. + * @param file File path to check. + * @return true if the file is a special file. 
+ */ + public static boolean isSpecialFile(File file) { + return file != null && (file.getAbsolutePath().startsWith("/dev/") || file.equals(DEV_DIR)); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java index 4caf7f485..757e6efdf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.utils.io; import org.apache.commons.io.FileUtils; @@ -194,4 +218,16 @@ public class IOUtilsUnitTest extends BaseTest { Assert.assertEquals(resource.getPath(), "foo"); Assert.assertEquals(resource.getRelativeClass(), Resource.class); } + + @Test + public void testIsSpecialFile() { + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); + Assert.assertFalse(IOUtils.isSpecialFile(null)); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); + } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala new file mode 100644 index 000000000..d891ebaaf --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.qscripts.examples + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +/** + * Script used for testing output to /dev/null + */ +class DevNullOutput extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + def script() { + val genotyper = new UnifiedGenotyper + genotyper.reference_sequence = referenceFile + genotyper.memoryLimit = 2 + genotyper.scatterCount = 3 + genotyper.input_file :+= bamFile + genotyper.out = "/dev/null" + add(genotyper) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index dee1acfac..7d9debbdc 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -163,7 +163,9 @@ trait QFunction extends Logging with QJobReport { * Returns prefixes for hidden done/fail files. * @return prefixes. */ - private def statusPrefixes = statusPaths.map(file => file.getParentFile + "/." + file.getName) + private def statusPrefixes = statusPaths. + filter(file => !IOUtils.isSpecialFile(file)). + map(file => file.getParentFile + "/." + file.getName) /** * Returns the output files for this function. 
@@ -236,7 +238,7 @@ trait QFunction extends Logging with QJobReport { * Deletes the output files and all the status files for this function. */ def deleteOutputs() { - outputs.foreach(file => IOUtils.tryDelete(file)) + outputs.filter(file => !IOUtils.isSpecialFile(file)).foreach(file => IOUtils.tryDelete(file)) doneOutputs.foreach(file => IOUtils.tryDelete(file)) failOutputs.foreach(file => IOUtils.tryDelete(file)) } @@ -346,7 +348,7 @@ trait QFunction extends Logging with QJobReport { if (jobOutputFile == null) { jobOutputFile = firstOutput match { - case file: File => new File(file.getParentFile, file.getName + ".out") + case file: File if (!IOUtils.isSpecialFile(file)) => new File(file.getParentFile, file.getName + ".out") case _ => new File(jobName + ".out") } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala index 921928bce..4578f0e82 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala @@ -134,8 +134,10 @@ trait ScatterGatherableFunction extends CommandLineFunction { var gatherOutputs = ListMap.empty[ArgumentSource, File] var gatherAddOrder = numClones + 2 - // Only track fields that will have a value - val outputFieldsWithValues = this.outputFields.filter(hasFieldValue(_)) + // Only track fields that will have an output file + val outputFieldsWithValues = this.outputFields. + filter(hasFieldValue(_)). 
+ filter(gatherField => !IOUtils.isSpecialFile(getFieldFile(gatherField))) for (gatherField <- outputFieldsWithValues) { gatherOutputs += gatherField -> getFieldFile(gatherField) @@ -175,9 +177,9 @@ trait ScatterGatherableFunction extends CommandLineFunction { cloneFunction.analysisName = this.analysisName cloneFunction.cloneIndex = i cloneFunction.commandDirectory = this.scatterGatherTempDir(dirFormat.format(i)) - cloneFunction.jobOutputFile = new File(this.jobOutputFile.getName) + cloneFunction.jobOutputFile = if (IOUtils.isSpecialFile(this.jobOutputFile)) this.jobOutputFile else new File(this.jobOutputFile.getName) if (this.jobErrorFile != null) - cloneFunction.jobErrorFile = new File(this.jobErrorFile.getName) + cloneFunction.jobErrorFile = if (IOUtils.isSpecialFile(this.jobErrorFile)) this.jobErrorFile else new File(this.jobErrorFile.getName) cloneFunction.addOrder = this.addOrder :+ (i+1) cloneFunction.isIntermediate = true diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala new file mode 100644 index 000000000..9bb287ac4 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.pipeline.examples + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class DevNullOutputPipelineTest { + @Test + def testDevNullOutput() { + val spec = new PipelineTestSpec + spec.name = "devnulloutput" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam").mkString + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala index d50673a1a..f598402af 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -39,6 +39,7 @@ class ExampleUnifiedGenotyperPipelineTest { " -I " + BaseTest.testDir + "exampleBAM.bam", " -filter QD", " -filterExpression 'QD < 2.0'").mkString + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } } From 2bb9525e7f9e505393651a2c47cfcb37d3dae075 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 23 Jan 2012 17:56:27 -0500 Subject: [PATCH 15/22] Don't set base qualities if fastQ is provided * Pacbio Processing pipeline now works with the new fastQ files outputted by the Pacbio instrument --- .../queue/qscripts/PacbioProcessingPipeline.scala | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index 
2f954713e..5cbea8ac4 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -1,7 +1,6 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.util.QScriptUtils import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.utils.exceptions.UserException @@ -60,12 +59,15 @@ class PacbioProcessingPipeline extends QScript { for (file: File <- fileList) { var USE_BWA: Boolean = false + var resetQuals: Boolean = true if (file.endsWith(".fasta") || file.endsWith(".fq")) { if (bwaPath == null) { throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); } USE_BWA = true + if (file.endsWith(".fq")) + resetQuals = false } // FASTA -> BAM steps @@ -99,7 +101,7 @@ class PacbioProcessingPipeline extends QScript { add(cov(bam, recalFile1), recal(bam, recalFile1, recalBam), - cov(recalBam, recalFile2), + cov(recalBam, recalFile2, resetQuals), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) } @@ -158,8 +160,9 @@ class PacbioProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".rg" } - case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.DBQ = dbq + case class cov (inBam: File, outRecalFile: File, resetQuals: Boolean) extends CountCovariates with CommandLineGATKArgs { + if (resetQuals) + this.DBQ = dbq this.knownSites :+= dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam From 945cf038895d930705bfa80cf80ab8e2c0de6743 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 23 Jan 2012 21:46:45 -0500 Subject: [PATCH 16/22] IntelliJ ate my import! 
--- .../sting/queue/qscripts/PacbioProcessingPipeline.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index 5cbea8ac4..d5f7512e4 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -6,6 +6,7 @@ import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.utils.exceptions.UserException import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups} +import org.broadinstitute.sting.queue.extensions.gatk._ /** * Created by IntelliJ IDEA. @@ -99,9 +100,9 @@ class PacbioProcessingPipeline extends QScript { val bam = if (BLASR_BAM) {mqBAM} else {bamBase} - add(cov(bam, recalFile1), + add(cov(bam, recalFile1, resetQuals), recal(bam, recalFile1, recalBam), - cov(recalBam, recalFile2, resetQuals), + cov(recalBam, recalFile2, false), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) } From 0a3172a9f1365cc9ed049173c468e1142de97396 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 24 Jan 2012 10:53:37 -0500 Subject: [PATCH 17/22] Fix for ref 0 bases for Chris -- Disturbingly, fixing this bug doesn't actually cause an test failures. -- Wrote a new QCRefWalker to actually check in detail that the reference bases coming into the RefWalker are all correct when comparing against a clean uncached load of the contig bases directly. 
-- However, I cannot run this tool due to some kind of weird BAM error -- sending this on to Matt --- .../sting/gatk/walkers/qc/QCRefWalker.java | 124 ++++++++++++++++++ .../CachingIndexedFastaSequenceFile.java | 2 +- 2 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java new file mode 100644 index 000000000..bddf27d84 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequence; +import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.PrintStream; +import java.util.Collections; +import java.util.List; + +/** + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

Input

+ *

+ * One reference file only. And optionally -L intervals + *

+ * + *

Output

+ *

+ * If ok, nothing, else will throw an exception at the site where there's been a problem + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T QCRefWalker
+ * 
+ * + */ +public class QCRefWalker extends RefWalker { + @Output + public PrintStream out; + + String contigName = ""; + int contigStart, contigEnd; + IndexedFastaSequenceFile uncachedRef; + byte[] uncachedBases; + + @Override + public void initialize() { + super.initialize(); //To change body of overridden methods use File | Settings | File Templates. + uncachedRef = getToolkit().getReferenceDataSource().getReference(); + } + + private final void throwError(ReferenceContext ref, String message) { + throw new StingException(String.format("Site %s failed: %s", ref, message)); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String locusContigName = ref.getLocus().getContig(); + if ( ! locusContigName.equals(contigName) ) { + contigName = locusContigName; + ReferenceSequence refSeq = uncachedRef.getSequence(contigName); + contigStart = 1; + contigEnd = contigStart + refSeq.length(); + uncachedBases = uncachedRef.getSubsequenceAt(contigName, contigStart, contigEnd).getBases(); + logger.warn(String.format("Loading contig %s (%d-%d)", contigName, contigStart, contigEnd)); + } + + final byte refBase = ref.getBase(); + if (! 
( BaseUtils.isRegularBase(refBase) || BaseUtils.isNBase(refBase) ) ) + throwError(ref, String.format("Refbase isn't a regular base (%d %c)", refBase, (char)refBase)); + + // check bases are equal + final int pos = (int)context.getPosition() - contigStart; + if ( pos > contigEnd ) + throwError(ref, String.format("off contig (len=%d)", contigEnd)); + final byte uncachedBase = uncachedBases[pos]; + + if ( uncachedBase != refBase ) + throwError(ref, String.format("Provided refBase (%d %c) not equal to uncached one (%d %c)", + refBase, (char)refBase, uncachedBase, (char)uncachedBase)); + + return 1; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer one, Integer sum) { + return one + sum; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 43ef4aa74..44b586bcd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -167,7 +167,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { if ( start < myCache.start || stop > myCache.stop || myCache.seq == null || myCache.seq.getContigIndex() != contigInfo.getSequenceIndex() ) { cacheMisses++; myCache.start = Math.max(start - cacheMissBackup, 0); - myCache.stop = Math.min(myCache.start + cacheSize, contigInfo.getSequenceLength()); + myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength()); myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); //System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop); } else { From 7c7ca0d799256c484fa65ded9e1f2c04585c761f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 24 Jan 2012 10:59:25 -0500 Subject: [PATCH 18/22] 
fixing bug with fastq extension * PPP only recognized .fasta and .fq, failing when the user provided a .fastq file. Fixed. --- .../org/broadinstitute/sting/queue/util/QScriptUtils.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 5d76f39ed..1529d9951 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -45,9 +45,12 @@ object QScriptUtils { * to have empty lines and comment lines (lines starting with #). */ def createSeqFromFile(in: File):Seq[File] = { - // If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file. + // If the file provided ends with .bam, .fasta, fastq or .fq, it is not a bam list, we treat it as a single file. // and return a list with only this file. - if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq")) + if (in.toString.toUpperCase.endsWith(".BAM") || + in.toString.toUpperCase.endsWith(".FASTA") || + in.toString.toUpperCase.endsWith(".FQ") || + in.toString.toUpperCase.endsWith("FASTQ") ) return Seq(in) var list: Seq[File] = Seq() From b07fdb108972fe40961a9d8e19019f754328429a Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 24 Jan 2012 14:53:31 -0500 Subject: [PATCH 19/22] Rename alltests* targets in build.xml "ant alltests" is now "ant committests" "ant alltests.public" is now "ant committests.public" "ant alltests.gatk.packagejar" is now "ant releasetests.gatk.packagejar" "ant alltests.queue.packagejar" is now "ant releasetests.queue.packagejar" This is going into both Stable + Unstable so that all Bamboo plans can be properly updated at the same time. 
--- build.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build.xml b/build.xml index 2086d0c9a..abe3a32a1 100644 --- a/build.xml +++ b/build.xml @@ -962,19 +962,19 @@ - + - + - + @@ -983,7 +983,7 @@ - + From c312bd59600c941afc99b7708e156765d62ce3be Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Tue, 24 Jan 2012 15:30:04 -0500 Subject: [PATCH 20/22] Weirdly, PicardException inherits from SAMException, which means that our specialty code for reporting malformed BAMs was actually misreporting any error that happened in the Picard layer as a BAM ERROR. Specifically changing PicardException to report as a ReviewedStingException; we might want to change it in the future. I'll followup with the Picard team to make sure they really, really want PicardException to inherit from SAMException. --- .../org/broadinstitute/sting/gatk/CommandLineGATK.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index b4d337d8d..9c59ffe9a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk; +import net.sf.picard.PicardException; +import net.sf.samtools.SAMException; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; @@ -95,7 +97,11 @@ public class CommandLineGATK extends CommandLineExecutable { // We can generate Tribble Exceptions in weird places when e.g. 
VCF genotype fields are // lazy loaded, so they aren't caught elsewhere and made into User Exceptions exitSystemWithUserError(e); - } catch (net.sf.samtools.SAMException e) { + } catch(PicardException e) { + // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions? + exitSystemWithError(e); + } + catch (SAMException e) { checkForTooManyOpenFilesProblem(e.getMessage()); exitSystemWithSamError(e); } catch (Throwable t) { From ffd61f4c1cdbf95dafc14976a375d31cc00e40f9 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 17 Jan 2012 18:56:50 -0500 Subject: [PATCH 21/22] Refactor the Pileup Element with regards to indels Eric reported this bug due to the reduced reads failing with an index out of bounds on what we thought was a deletion, but turned out to be a read starting with insertion. * Refactored PileupElement to distinguish clearly between deletions and read starting with insertion * Modified ExtendedEventPileup to correctly distinguish elements with deletion when creating new pileups * Refactored most of the lazyLoadNextAlignment() function of the LocusIteratorByState for clarity and to create clear separation between what is a pileup with a deletion and what's not one. Got rid of many useless if statements. * Changed the way LocusIteratorByState creates extended event pileups to differentiate between insertions in the beginning of the read and deletions. * Every deletion now has an offset (start of the event) * Fixed bug when LocusITeratorByState found a read starting with insertion that happened to be a reduced read. * Separated the definitions of deletion/insertion (in the beginning of the read) in all UG annotations (and the annotator engine). * Pileup depth of coverage for a deleted base will now return the average coverage around the deletion. 
* Indel ReadPositionRankSum test now uses the deletion true offset from the read, changed all appropriate md5's * The extra pileup elements now properly read by the Indel mode of the UG made any subsequent call have a different random number and therefore all RankSum tests have slightly different values (in the 10^-3 range). Updated all appropriate md5s after extremely careful inspection -- Thanks Ryan! phew! --- .../datasources/providers/AllLocusView.java | 38 +- .../gatk/iterators/LocusIteratorByState.java | 429 ++--- .../annotator/BaseQualityRankSumTest.java | 8 +- .../gatk/walkers/annotator/FisherStrand.java | 4 +- .../walkers/annotator/HaplotypeScore.java | 161 +- .../gatk/walkers/annotator/RankSumTest.java | 52 +- .../walkers/annotator/ReadPosRankSumTest.java | 73 +- .../DiploidSNPGenotypeLikelihoods.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 140 +- ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../pileup/AbstractReadBackedPileup.java | 522 +++--- .../pileup/ExtendedEventPileupElement.java | 112 +- .../sting/utils/pileup/PileupElement.java | 82 +- .../ReadBackedExtendedEventPileupImpl.java | 113 +- .../utils/pileup/ReadBackedPileupImpl.java | 27 +- .../sting/utils/sam/AlignmentUtils.java | 1519 +++++++++-------- .../sting/utils/sam/ArtificialSAMUtils.java | 87 +- .../org/broadinstitute/sting/BaseTest.java | 13 +- .../UnifiedGenotyperIntegrationTest.java | 8 +- .../sting/utils/sam/ReadUtilsUnitTest.java | 8 +- 20 files changed, 1788 insertions(+), 1612 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java index a6731ee18..d1a2e7519 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java @@ -23,7 +23,7 @@ import java.util.NoSuchElementException; */ /** - * A 
LocusView over which the user can iterate. + * A LocusView over which the user can iterate. */ public class AllLocusView extends LocusView { @@ -47,12 +47,13 @@ public class AllLocusView extends LocusView { /** * Create a new queue of locus contexts. + * * @param provider */ - public AllLocusView(LocusShardDataProvider provider) { - super( provider ); + public AllLocusView(LocusShardDataProvider provider) { + super(provider); // Seed the state tracking members with the first possible seek position and the first possible locus context. - locusIterator = new GenomeLocusIterator(genomeLocParser,provider.getLocus()); + locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus()); } public boolean hasNext() { @@ -63,7 +64,7 @@ public class AllLocusView extends LocusView { public AlignmentContext next() { advance(); - if(nextPosition == null) + if (nextPosition == null) throw new NoSuchElementException("No next is available in the all locus view"); // Flag to the iterator that no data is waiting in the queue to be processed. @@ -72,7 +73,7 @@ public class AllLocusView extends LocusView { AlignmentContext currentLocus; // If actual data is present, return it. Otherwise, return empty data. - if( nextLocus != null && nextLocus.getLocation().equals(nextPosition) ) + if (nextLocus != null && nextLocus.getLocation().equals(nextPosition)) currentLocus = nextLocus; else currentLocus = createEmptyLocus(nextPosition); @@ -82,15 +83,15 @@ public class AllLocusView extends LocusView { private void advance() { // Already at the next element? Don't move forward. - if(atNextElement) + if (atNextElement) return; // Out of elements? - if(nextPosition == null && !locusIterator.hasNext()) - return; + if (nextPosition == null && !locusIterator.hasNext()) + return; // If nextLocus has been consumed, clear it out to make room for the next incoming locus. 
- if(nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { + if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { nextLocus = null; // Determine the next locus. The trick is that we may have more than one alignment context at the same @@ -98,9 +99,9 @@ public class AllLocusView extends LocusView { // is still at the current position, we do not increment current position and wait for next call to next() to return // that context. If we know that next context is past the current position, we are done with current // position - if(hasNextLocus()) { + if (hasNextLocus()) { nextLocus = nextLocus(); - if(nextPosition.equals(nextLocus.getLocation())) { + if (nextPosition.equals(nextLocus.getLocation())) { atNextElement = true; return; } @@ -108,7 +109,7 @@ public class AllLocusView extends LocusView { } // No elements left in queue? Clear out the position state tracker and return. - if(!locusIterator.hasNext()) { + if (!locusIterator.hasNext()) { nextPosition = null; return; } @@ -119,9 +120,9 @@ public class AllLocusView extends LocusView { // Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus // while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal. - while(nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { + while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { nextLocus = null; - if(!hasNextLocus()) + if (!hasNextLocus()) break; nextLocus = nextLocus(); } @@ -129,12 +130,15 @@ public class AllLocusView extends LocusView { /** * Creates a blank locus context at the specified location. + * * @param site Site at which to create the blank locus context. * @return empty context. 
*/ private final static List EMPTY_PILEUP_READS = Collections.emptyList(); private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList(); - private AlignmentContext createEmptyLocus( GenomeLoc site ) { - return new AlignmentContext(site,new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); + private final static List EMPTY_DELETION_STATUS = Collections.emptyList(); + + private AlignmentContext createEmptyLocus(GenomeLoc site) { + return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 75e787e05..f1ffa121b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -49,9 +49,13 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; -/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */ +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ public class LocusIteratorByState extends LocusIterator { - /** our log, which we want to capture anything from this class */ + /** + * our log, which we want to capture anything from this class + */ private static Logger logger = Logger.getLogger(LocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- @@ -92,12 +96,14 @@ public class LocusIteratorByState extends LocusIterator { boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases? // the only purpose of this flag is to shield away a few additional lines of code // when extended piles are not needed, it may not be even worth it... 
- byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) - int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events - byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the + + byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) + int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events + byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the // current base on the ref. We use a counter-like variable here since clearing the indel event is // delayed by one base, so we need to remember how long ago we have seen the actual event - int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the + + int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, // we cache it here mainly for convenience @@ -111,23 +117,31 @@ public class LocusIteratorByState extends LocusIterator { //System.out.printf("Creating a SAMRecordState: %s%n", this); } - public SAMRecord getRead() { return read; } + public SAMRecord getRead() { + return read; + } /** * What is our current offset in the read's bases that aligns us with the reference genome? * * @return */ - public int getReadOffset() { return readOffset; } + public int getReadOffset() { + return readOffset; + } /** * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? 
* * @return */ - public int getGenomeOffset() { return genomeOffset; } + public int getGenomeOffset() { + return genomeOffset; + } - public int getGenomePosition() { return read.getAlignmentStart() + getGenomeOffset(); } + public int getGenomePosition() { + return read.getAlignmentStart() + getGenomeOffset(); + } public GenomeLoc getLocation(GenomeLocParser genomeLocParser) { return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); @@ -137,19 +151,26 @@ public class LocusIteratorByState extends LocusIterator { return curElement.getOperator(); } - /** Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome. + /** + * Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome. * * @return */ public boolean hadIndel() { - return ( eventLength > 0 ); + return (eventLength > 0); } - public int getEventLength() { return eventLength; } + public int getEventLength() { + return eventLength; + } - public byte[] getEventBases() { return insertedBases; } + public byte[] getEventBases() { + return insertedBases; + } - public int getReadEventStartOffset() { return eventStart; } + public int getReadEventStartOffset() { + return eventStart; + } public String toString() { return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); @@ -160,9 +181,9 @@ public class LocusIteratorByState extends LocusIterator { // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion - if ( curElement == null || ++cigarElementCounter > curElement.getLength() ) { + if (curElement == null || ++cigarElementCounter > curElement.getLength()) { cigarOffset++; - if ( cigarOffset < nCigarElements ) { + if (cigarOffset < nCigarElements) { curElement = cigar.getCigarElement(cigarOffset); cigarElementCounter = 0; // next line: guards 
against cigar elements of length 0; when new cigar element is retrieved, @@ -174,15 +195,15 @@ public class LocusIteratorByState extends LocusIterator { // current offset of this read is the next ref base after the end of the indel. This position will // model a point on the reference somewhere after the end of the read. genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: - // we do step forward on the ref, and by returning null we also indicate that we are past the read end. + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. - if ( generateExtendedEvents && eventDelayedFlag > 0 ) { + if (generateExtendedEvents && eventDelayedFlag > 0) { // if we had an indel right before the read ended (i.e. insertion was the last cigar element), // we keep it until next reference base; then we discard it and this will allow the LocusIterator to // finally discard this read eventDelayedFlag--; - if ( eventDelayedFlag == 0 ) { + if (eventDelayedFlag == 0) { eventLength = -1; // reset event when we are past it insertedBases = null; eventStart = -1; @@ -193,34 +214,35 @@ public class LocusIteratorByState extends LocusIterator { } } - boolean done = false; switch (curElement.getOperator()) { - case H : // ignore hard clips - case P : // ignore pads + case H: // ignore hard clips + case P: // ignore pads cigarElementCounter = curElement.getLength(); break; - case I : // insertion w.r.t. the reference - if ( generateExtendedEvents ) { + case I: // insertion w.r.t. 
the reference + if (generateExtendedEvents) { // we see insertions only once, when we step right onto them; the position on the read is scrolled // past the insertion right after that - if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName()); - insertedBases = Arrays.copyOfRange(read.getReadBases(),readOffset+1,readOffset+1+curElement.getLength()); - eventLength = curElement.getLength() ; + if (eventDelayedFlag > 1) + throw new UserException.MalformedBAM(read, "Adjacent I/D events in read " + read.getReadName()); + insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength()); + eventLength = curElement.getLength(); eventStart = readOffset; eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2 // System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset); } // continue onto the 'S' case ! - case S : // soft clip + case S: // soft clip cigarElementCounter = curElement.getLength(); readOffset += curElement.getLength(); break; - case D : // deletion w.r.t. the reference - if ( generateExtendedEvents ) { - if ( cigarElementCounter == 1) { + case D: // deletion w.r.t. the reference + if (generateExtendedEvents) { + if (cigarElementCounter == 1) { // generate an extended event only if we just stepped into the deletion (i.e. don't // generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!) 
- if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName()); + if (eventDelayedFlag > 1) + throw new UserException.MalformedBAM(read, "Adjacent I/D events in read " + read.getReadName()); eventLength = curElement.getLength(); eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only eventStart = readOffset; @@ -232,26 +254,27 @@ public class LocusIteratorByState extends LocusIterator { genomeOffset++; done = true; break; - case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) genomeOffset++; done = true; break; - case M : + case M: readOffset++; genomeOffset++; done = true; break; - default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); + default: + throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); } - if ( generateExtendedEvents ) { - if ( eventDelayedFlag > 0 && done ) { - // if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementthe, + if (generateExtendedEvents) { + if (eventDelayedFlag > 0 && done) { + // if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementing the, // the flag is 1, we are standing on the reference base right after the indel (so we have to keep it). // Otherwise, we are away from the previous indel and have to clear our memories... 
eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now - // if eventDelayedFlag == 1, an indel occured right before the current base - if ( eventDelayedFlag == 0 ) { + // if eventDelayedFlag == 1, an indel occured right before the current base + if (eventDelayedFlag == 0) { eventLength = -1; // reset event when we are past it insertedBases = null; eventStart = -1; @@ -274,15 +297,15 @@ public class LocusIteratorByState extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) { + public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); + this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. 
So we need to throw this error only when samIterator.hasNext() is true - if ( this.samples.isEmpty() && samIterator.hasNext() ) { + if (this.samples.isEmpty() && samIterator.hasNext()) { throw new IllegalArgumentException("samples list must not be empty"); } } @@ -322,7 +345,7 @@ public class LocusIteratorByState extends LocusIterator { // ----------------------------------------------------------------------------------------------------------------- public AlignmentContext next() { lazyLoadNextAlignmentContext(); - if(!hasNext()) + if (!hasNext()) throw new NoSuchElementException("LocusIteratorByState: out of elements."); AlignmentContext currentAlignmentContext = nextAlignmentContext; nextAlignmentContext = null; @@ -334,7 +357,7 @@ public class LocusIteratorByState extends LocusIterator { * nextAlignmentContext MUST BE null in order for this method to advance to the next entry. */ private void lazyLoadNextAlignmentContext() { - while(nextAlignmentContext == null && readStates.hasNext()) { + while (nextAlignmentContext == null && readStates.hasNext()) { // this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref: readStates.collectPendingReads(); @@ -350,14 +373,14 @@ public class LocusIteratorByState extends LocusIterator { // In this case, the subsequent call to next() will emit the normal pileup at the current base // and shift the position. 
if (readInfo.generateExtendedEvents() && hasExtendedEvents) { - Map fullExtendedEventPileup = new HashMap(); + Map fullExtendedEventPileup = new HashMap(); // get current location on the reference and decrement it by 1: the indels we just stepped over // are associated with the *previous* reference base - GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1); + GenomeLoc loc = genomeLocParser.incPos(getLocation(), -1); boolean hasBeenSampled = false; - for(final String sample: samples) { + for (final String sample : samples) { Iterator iterator = readStates.iterator(sample); List indelPile = new ArrayList(readStates.size(sample)); hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample); @@ -368,103 +391,108 @@ public class LocusIteratorByState extends LocusIterator { nMQ0Reads = 0; int maxDeletionLength = 0; - while(iterator.hasNext()) { - SAMRecordState state = iterator.next(); - if ( state.hadIndel() ) { + while (iterator.hasNext()) { + final SAMRecordState state = iterator.next(); + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final int readOffset = state.getReadOffset(); // the base offset on this read + final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. 
+ final int eventLength = state.getEventLength(); + +// if (op != CigarOperator.N) // N's are never added to any pileup +// continue; +// + if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref size++; - if ( state.getEventBases() == null ) { + ExtendedEventPileupElement pileupElement; + if (state.getEventBases() == null) { // Deletion event nDeletions++; - maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength()); + maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength()); + pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength); + } + else { // Insertion event + nInsertions++; + pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases()); } - else nInsertions++; - indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadEventStartOffset(), state.getEventLength(), state.getEventBases()) ); - } else { - // HACK: The readahead mechanism for LocusIteratorByState will effectively read past the current position - // and add in extra reads that start after this indel. Skip these reads. - // My belief at this moment after empirically looking at read->ref alignment is that, in a cigar string - // like 1I76M, the first insertion is between alignment start-1 and alignment start, so we shouldn't be - // filtering these out. - // TODO: UPDATE! Eric tells me that we *might* want reads adjacent to the pileup in the pileup. Strike this block. - //if(state.getRead().getAlignmentStart() > loc.getStart()) - // continue; - - if ( state.getCurrentCigarOperator() != CigarOperator.N ) { - // this read has no indel associated with the previous position on the ref; - // we count this read in only if it has actual bases, not N span... 
- if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) { - - // if cigar operator is D but the read has no extended event reported (that's why we ended - // up in this branch), it means that we are currently inside a deletion that started earlier; - // we count such reads (with a longer deletion spanning over a deletion at the previous base we are - // about to report) only if includeReadsWithDeletionAtLoci is true. - size++; - indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()-1, -1) // length=-1 --> noevent - ); - } - } + indelPile.add(pileupElement); } - if ( state.getRead().getMappingQuality() == 0 ) { + + // this read has no indel associated with the previous position on the ref. Criteria to include in the pileup are: + // we only add reads that are not N's + // we only include deletions to the pileup if the walker requests it + else if ( (op != CigarOperator.N) && (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci())) { + size++; + indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset)); + } + + + if (state.getRead().getMappingQuality() == 0) nMQ0Reads++; - } - } - if( indelPile.size() != 0 ) fullExtendedEventPileup.put(sample,new ReadBackedExtendedEventPileupImpl(loc,indelPile,size,maxDeletionLength,nInsertions,nDeletions,nMQ0Reads)); - } - hasExtendedEvents = false; // we are done with extended events prior to current ref base -// System.out.println("Indel(s) at "+loc); -// for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println(" "+pe.toString()); } - nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled); - } else { - GenomeLoc location = getLocation(); - Map fullPileup = new HashMap(); + } + + if (indelPile.size() != 0) + fullExtendedEventPileup.put(sample, new 
ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads)); + } + hasExtendedEvents = false; // we are done with extended events prior to current ref base + nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled); + } + else { // this is a regular event pileup (not extended) + GenomeLoc location = getLocation(); + Map fullPileup = new HashMap(); boolean hasBeenSampled = false; - for(final String sample: samples) { + for (final String sample : samples) { Iterator iterator = readStates.iterator(sample); List pile = new ArrayList(readStates.size(sample)); hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); - size = 0; - nDeletions = 0; - nMQ0Reads = 0; + size = 0; // number of elements in this sample's pileup + nDeletions = 0; // number of deletions in this sample's pileup + nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) - while(iterator.hasNext()) { - SAMRecordState state = iterator.next(); - if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { - if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) { - //discarded_bases++; - //printStatus("Adaptor bases", discarded_adaptor_bases); - continue; - } else { - //observed_bases++; - pile.add(new PileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset())); + while (iterator.hasNext()) { + final SAMRecordState state = iterator.next(); // state object with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final int readOffset = state.getReadOffset(); // the base offset on this read + final int eventStartOffset = state.getReadEventStartOffset(); // 
this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (read.getMappingQuality() == 0) + nMQ0Reads++; + + if (op == CigarOperator.D) { + if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + int leftAlignedStart = (eventStartOffset < 0) ? readOffset : eventStartOffset; + pile.add(new PileupElement(read, leftAlignedStart, true)); + size++; + nDeletions++; + } + } else { + if (!filterBaseInRead(read, location.getStart())) { + pile.add(new PileupElement(read, readOffset, false)); size++; } - } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) { - size++; - pile.add(new PileupElement((GATKSAMRecord) state.getRead(), -1)); - nDeletions++; - } - - if ( state.getRead().getMappingQuality() == 0 ) { - nMQ0Reads++; } } - if( pile.size() != 0 ) - fullPileup.put(sample,new ReadBackedPileupImpl(location,pile,size,nDeletions,nMQ0Reads)); + if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup + fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads)); } - updateReadStates(); // critical - must be called after we get the current state offsets and location - // if we got reads with non-D/N over the current position, we are done - if ( !fullPileup.isEmpty() ) nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location,fullPileup),hasBeenSampled); + updateReadStates(); // critical - must be called after we get the current state offsets and location + if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); } } } // fast 
testing of position private boolean readIsPastCurrentPosition(SAMRecord read) { - if ( readStates.isEmpty() ) + if (readStates.isEmpty()) return false; else { SAMRecordState state = readStates.getFirst(); @@ -485,20 +513,18 @@ public class LocusIteratorByState extends LocusIterator { } private void updateReadStates() { - for(final String sample: samples) { + for (final String sample : samples) { Iterator it = readStates.iterator(sample); - while ( it.hasNext() ) { + while (it.hasNext()) { SAMRecordState state = it.next(); CigarOperator op = state.stepForwardOnGenome(); - if ( state.hadIndel() && readInfo.generateExtendedEvents() ) hasExtendedEvents = true; - else { + if (state.hadIndel() && readInfo.generateExtendedEvents()) + hasExtendedEvents = true; + else if (op == null) { // we discard the read only when we are past its end AND indel at the end of the read (if any) was // already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
- if ( op == null ) { // we've stepped off the end of the object - //if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart())); - it.remove(); - } + it.remove(); // we've stepped off the end of the object } } } @@ -508,20 +534,20 @@ public class LocusIteratorByState extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - private class ReadStateManager { + private class ReadStateManager { private final PeekableIterator iterator; private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + private final Map readStatesBySample = new HashMap(); private final int targetCoverage; private int totalReadStates = 0; public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { this.iterator = new PeekableIterator(source); this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; - switch(this.downsamplingMethod.type) { + switch (this.downsamplingMethod.type) { case BY_SAMPLE: - if(downsamplingMethod.toCoverage == null) + if (downsamplingMethod.toCoverage == null) throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); this.targetCoverage = downsamplingMethod.toCoverage; break; @@ -529,10 +555,10 @@ public class LocusIteratorByState extends LocusIterator { this.targetCoverage = Integer.MAX_VALUE; } - Map readSelectors = new HashMap(); - for(final String sample: samples) { - readStatesBySample.put(sample,new PerSampleReadStateManager()); - readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? 
new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); + Map readSelectors = new HashMap(); + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); } samplePartitioner = new SamplePartitioner(readSelectors); @@ -541,6 +567,7 @@ public class LocusIteratorByState extends LocusIterator { /** * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented * for this iterator; if present, total read states will be decremented. + * * @param sample The sample. * @return Iterator over the reads associated with that sample. */ @@ -569,6 +596,7 @@ public class LocusIteratorByState extends LocusIterator { /** * Retrieves the total number of reads in the manager across all samples. + * * @return Total number of reads over all samples. */ public int size() { @@ -577,6 +605,7 @@ public class LocusIteratorByState extends LocusIterator { /** * Retrieves the total number of reads in the manager in the given sample. + * * @param sample The sample. * @return Total number of reads in the given sample. */ @@ -587,6 +616,7 @@ public class LocusIteratorByState extends LocusIterator { /** * The extent of downsampling; basically, the furthest base out which has 'fallen * victim' to the downsampler. + * * @param sample Sample, downsampled independently. * @return Integer stop of the furthest undownsampled region. 
*/ @@ -595,9 +625,9 @@ public class LocusIteratorByState extends LocusIterator { } public SAMRecordState getFirst() { - for(final String sample: samples) { + for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); - if(!reads.isEmpty()) + if (!reads.isEmpty()) return reads.peek(); } return null; @@ -608,19 +638,18 @@ public class LocusIteratorByState extends LocusIterator { } public void collectPendingReads() { - if(!iterator.hasNext()) + if (!iterator.hasNext()) return; - if(readStates.size() == 0) { + if (readStates.size() == 0) { int firstContigIndex = iterator.peek().getReferenceIndex(); int firstAlignmentStart = iterator.peek().getAlignmentStart(); - while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { + while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { samplePartitioner.submitRead(iterator.next()); } - } - else { + } else { // Fast fail in the case that the read is past the current position. 
- if(readIsPastCurrentPosition(iterator.peek())) + if (readIsPastCurrentPosition(iterator.peek())) return; while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { @@ -629,7 +658,7 @@ public class LocusIteratorByState extends LocusIterator { } samplePartitioner.complete(); - for(final String sample: samples) { + for (final String sample : samples) { ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); Collection newReads = new ArrayList(aggregator.getSelectedReads()); @@ -638,21 +667,20 @@ public class LocusIteratorByState extends LocusIterator { int numReads = statesBySample.size(); int downsamplingExtent = aggregator.getDownsamplingExtent(); - if(numReads+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.NONE) { + if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { long readLimit = aggregator.getNumReadsSeen(); - addReadsToSample(statesBySample,newReads,readLimit); + addReadsToSample(statesBySample, newReads, readLimit); statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } - else { + } else { int[] counts = statesBySample.getCountsPerAlignmentStart(); int[] updatedCounts = new int[counts.length]; - System.arraycopy(counts,0,updatedCounts,0,counts.length); + System.arraycopy(counts, 0, updatedCounts, 0, counts.length); boolean readPruned = true; - while(numReads+newReads.size()>targetCoverage && readPruned) { + while (numReads + newReads.size() > targetCoverage && readPruned) { readPruned = false; - for(int alignmentStart=updatedCounts.length-1;numReads+newReads.size()>targetCoverage&&alignmentStart>=0;alignmentStart--) { - if(updatedCounts[alignmentStart] > 1) { + for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { + if (updatedCounts[alignmentStart] > 1) { updatedCounts[alignmentStart]--; numReads--; readPruned = true; @@ -660,7 +688,7 @@ public class 
LocusIteratorByState extends LocusIterator { } } - if(numReads == targetCoverage) { + if (numReads == targetCoverage) { updatedCounts[0]--; numReads--; } @@ -668,18 +696,18 @@ public class LocusIteratorByState extends LocusIterator { BitSet toPurge = new BitSet(readStates.size()); int readOffset = 0; - for(int i = 0; i < updatedCounts.length; i++) { + for (int i = 0; i < updatedCounts.length; i++) { int n = counts[i]; int k = updatedCounts[i]; - for(Integer purgedElement: MathUtils.sampleIndicesWithoutReplacement(n,n-k)) - toPurge.set(readOffset+purgedElement); + for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) + toPurge.set(readOffset + purgedElement); readOffset += counts[i]; } - downsamplingExtent = Math.max(downsamplingExtent,statesBySample.purge(toPurge)); - - addReadsToSample(statesBySample,newReads,targetCoverage-numReads); + downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); + + addReadsToSample(statesBySample, newReads, targetCoverage - numReads); statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); } } @@ -688,23 +716,25 @@ public class LocusIteratorByState extends LocusIterator { /** * Add reads with the given sample name to the given hanger entry. + * * @param readStates The list of read states to add this collection of reads. - * @param reads Reads to add. Selected reads will be pulled from this source. - * @param maxReads Maximum number of reads to add. + * @param reads Reads to add. Selected reads will be pulled from this source. + * @param maxReads Maximum number of reads to add. 
*/ private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { - if(reads.isEmpty()) + if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); int readCount = 0; - for(SAMRecord read: reads) { - if(readCount < maxReads) { + for (SAMRecord read : reads) { + if (readCount < maxReads) { SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents()); state.stepForwardOnGenome(); newReadStates.add(state); // TODO: What if we downsample the extended events away? - if (state.hadIndel()) hasExtendedEvents = true; + if (state.hadIndel()) + hasExtendedEvents = true; readCount++; } } @@ -735,7 +765,7 @@ public class LocusIteratorByState extends LocusIterator { } public void specifyNewDownsamplingExtent(int downsamplingExtent) { - this.downsamplingExtent = Math.max(this.downsamplingExtent,downsamplingExtent); + this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); } public int getDownsamplingExtent() { @@ -745,7 +775,7 @@ public class LocusIteratorByState extends LocusIterator { public int[] getCountsPerAlignmentStart() { int[] counts = new int[readStateCounter.size()]; int index = 0; - for(Counter counter: readStateCounter) + for (Counter counter : readStateCounter) counts[index++] = counter.getCount(); return counts; } @@ -766,7 +796,7 @@ public class LocusIteratorByState extends LocusIterator { wrappedIterator.remove(); Counter counter = readStateCounter.peek(); counter.decrement(); - if(counter.getCount() == 0) + if (counter.getCount() == 0) readStateCounter.remove(); } }; @@ -775,13 +805,14 @@ public class LocusIteratorByState extends LocusIterator { /** * Purge the given elements from the bitset. If an element in the bitset is true, purge * the corresponding read state. + * * @param elements bits from the set to purge. * @return the extent of the final downsampled read. 
*/ public int purge(final BitSet elements) { int downsamplingExtent = 0; - if(elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; Iterator readStateIterator = readStates.iterator(); @@ -794,22 +825,22 @@ public class LocusIteratorByState extends LocusIterator { int toPurge = elements.nextSetBit(0); int removedCount = 0; - while(readStateIterator.hasNext() && toPurge >= 0) { + while (readStateIterator.hasNext() && toPurge >= 0) { SAMRecordState state = readStateIterator.next(); - downsamplingExtent = Math.max(downsamplingExtent,state.getRead().getAlignmentEnd()); + downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); - if(readIndex == toPurge) { + if (readIndex == toPurge) { readStateIterator.remove(); currentCounter.decrement(); - if(currentCounter.getCount() == 0) + if (currentCounter.getCount() == 0) counterIterator.remove(); removedCount++; - toPurge = elements.nextSetBit(toPurge+1); + toPurge = elements.nextSetBit(toPurge + 1); } readIndex++; alignmentStartCounter--; - if(alignmentStartCounter == 0 && counterIterator.hasNext()) { + if (alignmentStartCounter == 0 && counterIterator.hasNext()) { currentCounter = counterIterator.next(); alignmentStartCounter = currentCounter.getCount(); } @@ -849,12 +880,14 @@ public class LocusIteratorByState extends LocusIterator { interface ReadSelector { /** * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. + * * @param read the read to evaluate. */ public void submitRead(SAMRecord read); /** * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. + * * @param read the read previously rejected. */ public void notifyReadRejected(SAMRecord read); @@ -866,12 +899,14 @@ interface ReadSelector { /** * Retrieve the number of reads seen by this selector so far. 
+ * * @return number of reads seen. */ public long getNumReadsSeen(); /** * Return the number of reads accepted by this selector so far. + * * @return number of reads selected. */ public long getNumReadsSelected(); @@ -880,12 +915,14 @@ interface ReadSelector { * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at * position 3 whose cigar string is 76M, the value of this parameter will be 78. + * * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. */ public int getDownsamplingExtent(); /** * Get the reads selected by this selector. + * * @return collection of reads selected by this selector. */ public Collection getSelectedReads(); @@ -911,7 +948,7 @@ class AllReadsSelector implements ReadSelector { public void notifyReadRejected(SAMRecord read) { readsSeen++; - downsamplingExtent = Math.max(downsamplingExtent,read.getAlignmentEnd()); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); } public void complete() { @@ -949,18 +986,18 @@ class NRandomReadSelector implements ReadSelector { private final ReservoirDownsampler reservoir; private final ReadSelector chainedSelector; private long readsSeen = 0; - private int downsamplingExtent = 0; + private int downsamplingExtent = 0; public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { - this.reservoir = new ReservoirDownsampler((int)readLimit); + this.reservoir = new ReservoirDownsampler((int) readLimit); this.chainedSelector = chainedSelector; } public void submitRead(SAMRecord read) { SAMRecord displaced = reservoir.add(read); - if(displaced != null && chainedSelector != null) { + if (displaced != null && chainedSelector != null) { chainedSelector.notifyReadRejected(read); - downsamplingExtent = 
Math.max(downsamplingExtent,read.getAlignmentEnd()); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); } readsSeen++; } @@ -970,9 +1007,9 @@ class NRandomReadSelector implements ReadSelector { } public void complete() { - for(SAMRecord read: reservoir.getDownsampledContents()) + for (SAMRecord read : reservoir.getDownsampledContents()) chainedSelector.submitRead(read); - if(chainedSelector != null) + if (chainedSelector != null) chainedSelector.complete(); } @@ -987,7 +1024,7 @@ class NRandomReadSelector implements ReadSelector { public int getDownsamplingExtent() { return downsamplingExtent; - } + } public Collection getSelectedReads() { return reservoir.getDownsampledContents(); @@ -996,7 +1033,7 @@ class NRandomReadSelector implements ReadSelector { public void reset() { reservoir.clear(); downsamplingExtent = 0; - if(chainedSelector != null) + if (chainedSelector != null) chainedSelector.reset(); } } @@ -1005,23 +1042,23 @@ class NRandomReadSelector implements ReadSelector { * Note: stores reads by sample ID string, not by sample object */ class SamplePartitioner implements ReadSelector { - private final Map readsBySample; + private final Map readsBySample; private long readsSeen = 0; - public SamplePartitioner(Map readSelectors) { + public SamplePartitioner(Map readSelectors) { readsBySample = readSelectors; } public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null; - if(readsBySample.containsKey(sampleName)) + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) readsBySample.get(sampleName).submitRead(read); readsSeen++; } public void notifyReadRejected(SAMRecord read) { - String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null; - if(readsBySample.containsKey(sampleName)) + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) readsBySample.get(sampleName).notifyReadRejected(read); readsSeen++; } @@ -1040,23 +1077,23 @@ class SamplePartitioner implements ReadSelector { public int getDownsamplingExtent() { int downsamplingExtent = 0; - for(ReadSelector storage: readsBySample.values()) - downsamplingExtent = Math.max(downsamplingExtent,storage.getDownsamplingExtent()); + for (ReadSelector storage : readsBySample.values()) + downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); return downsamplingExtent; } - + public Collection getSelectedReads() { throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); } public ReadSelector getSelectedReads(String sampleName) { - if(!readsBySample.containsKey(sampleName)) + if (!readsBySample.containsKey(sampleName)) throw new NoSuchElementException("Sample name not found"); return readsBySample.get(sampleName); } public void reset() { - for(ReadSelector storage: readsBySample.values()) + for (ReadSelector storage : readsBySample.values()) storage.reset(); readsSeen = 0; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 312b505ec..507a6559c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -25,13 +25,13 @@ public class BaseQualityRankSumTest extends RankSumTest { protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if( isUsableBase(p) ) { - if ( p.getBase() == ref ) { + if ( p.getBase() == ref ) refQuals.add((double)p.getQual()); - } else if ( p.getBase() == alt ) { + else if ( p.getBase() 
== alt ) altQuals.add((double)p.getQual()); - } } } + } protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? @@ -57,8 +57,6 @@ public class BaseQualityRankSumTest extends RankSumTest { refQuals.add(-10.0*refLikelihood); else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) altQuals.add(-10.0*altLikelihood); - - } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 0dda02421..987579ab8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -205,7 +205,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - if ( p.isDeletion() || p.isReducedRead() ) // ignore deletions and reduced reads + if ( p.isDeletion() || p.getRead().isReducedRead() ) // ignore deletions and reduced reads continue; if ( p.getRead().getMappingQuality() < 20 || p.getQual() < 20 ) @@ -258,7 +258,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat continue; for (final PileupElement p: pileup) { - if ( p.isReducedRead() ) // ignore reduced reads + if ( p.getRead().isReducedRead() ) // ignore reduced reads continue; if ( p.getRead().getMappingQuality() < 20) continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 551f8e2cf..40b5aa4d5 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -43,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -62,15 +62,15 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private final static char REGEXP_WILDCARD = '.'; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here + if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here return null; - if (vc.isSNP() && !vc.isBiallelic()) + if (vc.isSNP() && !vc.isBiallelic()) return null; final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); - final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); + final int contextWingSize = Math.min(((int) ref.getWindow().size() - 1) / 2, MIN_CONTEXT_WING_SIZE); final int contextSize = contextWingSize * 2 + 1; final int 
locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2; @@ -84,14 +84,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if (pileup == null) return null; - + final List haplotypes = computeHaplotypes(pileup, contextSize, locus, vc); - final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); + final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); if (haplotypes != null) { - for ( final Genotype genotype : vc.getGenotypes()) { + for (final Genotype genotype : vc.getGenotypes()) { final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); - if ( thisContext != null ) { + if (thisContext != null) { final ReadBackedPileup thisPileup; if (thisContext.hasExtendedEventPileup()) thisPileup = thisContext.getExtendedEventPileup(); @@ -102,14 +102,13 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if (thisPileup != null) { if (vc.isSNP()) - scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense else if (vc.isIndel() || vc.isMixed()) { Double d = scoreIndelsAgainstHaplotypes(thisPileup); if (d == null) return null; - scoreRA.add( d ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense - } - else + scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + } else return null; } } @@ -122,12 +121,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot return map; } - private class HaplotypeComparator 
implements Comparator{ + private class HaplotypeComparator implements Comparator { public int compare(Haplotype a, Haplotype b) { if (a.getQualitySum() < b.getQualitySum()) return 1; - if (a.getQualitySum() > b.getQualitySum()){ + if (a.getQualitySum() > b.getQualitySum()) { return -1; } return 0; @@ -137,39 +136,38 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private List computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus, final VariantContext vc) { // Compute all possible haplotypes consistent with current pileup - int haplotypesToCompute = vc.getAlternateAlleles().size()+1; + int haplotypesToCompute = vc.getAlternateAlleles().size() + 1; final PriorityQueue candidateHaplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); final PriorityQueue consensusHaplotypeQueue = new PriorityQueue(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); - for ( final PileupElement p : pileup ) { + for (final PileupElement p : pileup) { final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus); candidateHaplotypeQueue.add(haplotypeFromRead); } // Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes Haplotype elem; - while ((elem = candidateHaplotypeQueue.poll()) != null) { + while ((elem = candidateHaplotypeQueue.poll()) != null) { boolean foundHaplotypeMatch = false; Haplotype lastCheckedHaplotype = null; - for ( final Haplotype haplotypeFromList : consensusHaplotypeQueue ) { + for (final Haplotype haplotypeFromList : consensusHaplotypeQueue) { final Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); - if (consensusHaplotype != null) { + if (consensusHaplotype != null) { foundHaplotypeMatch = true; if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) { consensusHaplotypeQueue.remove(haplotypeFromList); 
consensusHaplotypeQueue.add(consensusHaplotype); } break; - } - else { + } else { lastCheckedHaplotype = haplotypeFromList; } } if (!foundHaplotypeMatch && consensusHaplotypeQueue.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) { consensusHaplotypeQueue.add(elem); - } else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum() ) { + } else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum()) { consensusHaplotypeQueue.remove(lastCheckedHaplotype); consensusHaplotypeQueue.add(elem); } @@ -180,12 +178,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot // The consensus haplotypes are in a quality-ordered priority queue, so the best haplotypes are just the ones at the front of the queue final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); - Listhlist = new ArrayList(); + List hlist = new ArrayList(); hlist.add(new Haplotype(haplotype1.getBases(), 60)); - for (int k=1; k < haplotypesToCompute; k++) { + for (int k = 1; k < haplotypesToCompute; k++) { Haplotype haplotype2 = consensusHaplotypeQueue.poll(); - if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found + if (haplotype2 == null) { + haplotype2 = haplotype1; + } // Sometimes only the reference haplotype can be found hlist.add(new Haplotype(haplotype2.getBases(), 20)); } return hlist; @@ -194,36 +194,43 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { - final SAMRecord read = p.getRead(); + final GATKSAMRecord read = p.getRead(); int readOffsetFromPileup = p.getOffset(); final byte[] haplotypeBases = new byte[contextSize]; - Arrays.fill(haplotypeBases, (byte)REGEXP_WILDCARD); + Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); final double[] 
baseQualities = new double[contextSize]; Arrays.fill(baseQualities, 0.0); byte[] readBases = read.getReadBases(); - readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string + readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string byte[] readQuals = read.getBaseQualities(); - readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string + readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); - final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; - for (int i = 0; i < contextSize; i++ ) { + for (int i = 0; i < contextSize; i++) { final int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) { + if (baseOffset < 0) { continue; } - if ( baseOffset >= readBases.length ) { + if (baseOffset >= readBases.length) { break; } - if( readQuals[baseOffset] == PileupElement.DELETION_BASE) { readQuals[baseOffset] = PileupElement.DELETION_QUAL; } - if( !BaseUtils.isRegularBase(readBases[baseOffset]) ) { readBases[baseOffset] = (byte)REGEXP_WILDCARD; readQuals[baseOffset] = (byte) 0; } // N's shouldn't be treated as distinct bases - readQuals[baseOffset] = (byte)Math.min((int)readQuals[baseOffset], p.getMappingQual()); - if( ((int)readQuals[baseOffset]) < 5 ) { readQuals[baseOffset] = (byte) 0; } // quals less than 5 are used as codes and don't have actual probabilistic 
meaning behind them + if (readQuals[baseOffset] == PileupElement.DELETION_BASE) { + readQuals[baseOffset] = PileupElement.DELETION_QUAL; + } + if (!BaseUtils.isRegularBase(readBases[baseOffset])) { + readBases[baseOffset] = (byte) REGEXP_WILDCARD; + readQuals[baseOffset] = (byte) 0; + } // N's shouldn't be treated as distinct bases + readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual()); + if (((int) readQuals[baseOffset]) < 5) { + readQuals[baseOffset] = (byte) 0; + } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them haplotypeBases[i] = readBases[baseOffset]; - baseQualities[i] = (double)readQuals[baseOffset]; + baseQualities[i] = (double) readQuals[baseOffset]; } return new Haplotype(haplotypeBases, baseQualities); @@ -238,7 +245,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } byte chA, chB; - final byte wc = (byte)REGEXP_WILDCARD; + final byte wc = (byte) REGEXP_WILDCARD; final int length = a.length; final byte[] consensusChars = new byte[length]; @@ -247,7 +254,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final double[] qualsA = haplotypeA.getQuals(); final double[] qualsB = haplotypeB.getQuals(); - for (int i=0; i < length; i++) { + for (int i = 0; i < length; i++) { chA = a[i]; chB = b[i]; @@ -257,17 +264,15 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if ((chA == wc) && (chB == wc)) { consensusChars[i] = wc; consensusQuals[i] = 0.0; - } - else if ((chA == wc)) { + } else if ((chA == wc)) { consensusChars[i] = chB; consensusQuals[i] = qualsB[i]; - } - else if ((chB == wc)){ + } else if ((chB == wc)) { consensusChars[i] = chA; consensusQuals[i] = qualsA[i]; } else { consensusChars[i] = chA; - consensusQuals[i] = qualsA[i]+qualsB[i]; + consensusQuals[i] = qualsA[i] + qualsB[i]; } } @@ -276,31 +281,33 @@ public class HaplotypeScore extends InfoFieldAnnotation 
implements StandardAnnot // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes private double scoreReadsAgainstHaplotypes(final List haplotypes, final ReadBackedPileup pileup, final int contextSize, final int locus) { - if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0)); - if ( DEBUG ) System.out.printf("HAP2: %s%n", haplotypes.get(1)); + if (DEBUG) System.out.printf("HAP1: %s%n", haplotypes.get(0)); + if (DEBUG) System.out.printf("HAP2: %s%n", haplotypes.get(1)); final ArrayList haplotypeScores = new ArrayList(); - for ( final PileupElement p : pileup ) { + for (final PileupElement p : pileup) { // Score all the reads in the pileup, even the filtered ones final double[] scores = new double[haplotypes.size()]; - for ( int i = 0; i < haplotypes.size(); i++ ) { + for (int i = 0; i < haplotypes.size(); i++) { final Haplotype haplotype = haplotypes.get(i); final double score = scoreReadAgainstHaplotype(p, contextSize, haplotype, locus); scores[i] = score; - if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i, score); } + if (DEBUG) { + System.out.printf(" vs. haplotype %d = %f%n", i, score); + } } haplotypeScores.add(scores); } double overallScore = 0.0; - for ( final double[] readHaplotypeScores : haplotypeScores ) { + for (final double[] readHaplotypeScores : haplotypeScores) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } return overallScore; } - private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus ) { + private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) { double expected = 0.0; double mismatches = 0.0; @@ -315,33 +322,35 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch. 
// so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n final byte[] haplotypeBases = haplotype.getBases(); - final SAMRecord read = p.getRead(); + final GATKSAMRecord read = p.getRead(); byte[] readBases = read.getReadBases(); readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string byte[] readQuals = read.getBaseQualities(); readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string int readOffsetFromPileup = p.getOffset(); - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); - final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; - for ( int i = 0; i < contextSize; i++ ) { + for (int i = 0; i < contextSize; i++) { final int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) { + if (baseOffset < 0) { continue; } - if ( baseOffset >= readBases.length ) { + if (baseOffset >= readBases.length) { break; } final byte haplotypeBase = haplotypeBases[i]; final byte readBase = readBases[baseOffset]; - final boolean matched = ( readBase == haplotypeBase || haplotypeBase == (byte)REGEXP_WILDCARD ); + final boolean matched = (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD); byte qual = readQuals[baseOffset]; - if( qual == PileupElement.DELETION_BASE ) { qual = PileupElement.DELETION_QUAL; } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions - qual = (byte)Math.min((int)qual, p.getMappingQual()); - if( ((int) qual) >= 5 ) { // quals less than 5 are used as codes and 
don't have actual probabilistic meaning behind them + if (qual == PileupElement.DELETION_BASE) { + qual = PileupElement.DELETION_QUAL; + } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions + qual = (byte) Math.min((int) qual, p.getMappingQual()); + if (((int) qual) >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them final double e = QualityUtils.qualToErrorProb(qual); expected += e; mismatches += matched ? e : 1.0 - e / 3.0; @@ -355,26 +364,27 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { final ArrayList haplotypeScores = new ArrayList(); - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - if (indelLikelihoodMap== null) + if (indelLikelihoodMap == null) return null; - for (final PileupElement p: pileup) { + for (final PileupElement p : pileup) { if (indelLikelihoodMap.containsKey(p)) { // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); + LinkedHashMap el = indelLikelihoodMap.get(p); // Score all the reads in the pileup, even the filtered ones final double[] scores = new double[el.size()]; int i = 0; - for (Allele a: el.keySet() ) { + for (Allele a : el.keySet()) { scores[i++] = -el.get(a); - if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i-1, scores[i-1]); } + if (DEBUG) { + System.out.printf(" vs. 
haplotype %d = %f%n", i - 1, scores[i - 1]); + } } haplotypeScores.add(scores); @@ -383,7 +393,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot // indel likelihoods are stric log-probs, not phred scored double overallScore = 0.0; - for ( final double[] readHaplotypeScores : haplotypeScores ) { + for (final double[] readHaplotypeScores : haplotypeScores) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } @@ -392,6 +402,11 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); } + public List getKeyNames() { + return Arrays.asList("HaplotypeScore"); + } + + public List getDescriptions() { + return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index c5a2df1fd..e0e62cdb0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -30,11 +30,11 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar static final boolean DEBUG = false; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) + if (stratifiedContexts.size() == 0) return null; - + final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) + if (genotypes == null 
|| genotypes.size() == 0) return null; @@ -43,19 +43,18 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (vc.isSNP() && vc.isBiallelic()) { // todo - no current support for multiallelic snps - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + for (final Genotype genotype : genotypes.iterateInSampleNameOrder()) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) { + if (context == null) { continue; } fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals); } - } - else if (vc.isIndel() || vc.isMixed()) { + } else if (vc.isIndel() || vc.isMixed()) { - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + for (final Genotype genotype : genotypes.iterateInSampleNameOrder()) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) { + if (context == null) { continue; } @@ -74,46 +73,47 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar fillIndelQualsFromPileup(pileup, refQuals, altQuals); } - } - else + } else return null; final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for ( final Double qual : altQuals ) { + for (final Double qual : altQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); } - for ( final Double qual : refQuals ) { + for (final Double qual : refQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); } if (DEBUG) { - System.out.format("%s, REF QUALS:",this.getClass().getName()); - for ( final Double qual : refQuals ) - System.out.format("%4.1f ",qual); + System.out.format("%s, REF QUALS:", this.getClass().getName()); + for (final Double qual : refQuals) + System.out.format("%4.1f ", qual); System.out.println(); - System.out.format("%s, ALT QUALS:",this.getClass().getName()); - for ( final Double qual : altQuals ) - System.out.format("%4.1f ",qual); + 
System.out.format("%s, ALT QUALS:", this.getClass().getName()); + for (final Double qual : altQuals) + System.out.format("%4.1f ", qual); System.out.println(); } // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) - final Pair testResults = mannWhitneyU.runOneSidedTest( MannWhitneyU.USet.SET1 ); + final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); final Map map = new HashMap(); - if ( ! Double.isNaN(testResults.first) ) + if (!Double.isNaN(testResults.first)) map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); return map; } protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals); + protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); - protected static boolean isUsableBase( final PileupElement p ) { - return !( p.isDeletion() || - p.getMappingQual() == 0 || - p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || - ((int)p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE ); // need the unBAQed quality score here + protected static boolean isUsableBase(final PileupElement p) { + return !(p.isInsertionAtBeginningOfRead() || + p.isDeletion() || + p.getMappingQual() == 0 || + p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || + ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index d762af428..b0039d1a0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -24,27 +24,32 @@ import java.util.List; */ public class ReadPosRankSumTest extends 
RankSumTest { - public List getKeyNames() { return Arrays.asList("ReadPosRankSum"); } + public List getKeyNames() { + return Arrays.asList("ReadPosRankSum"); + } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); } + public List getDescriptions() { + return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); + } protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { - for ( final PileupElement p : pileup ) { - if( isUsableBase(p) ) { - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0); + for (final PileupElement p : pileup) { + if (isUsableBase(p)) { + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); - if( readPos > numAlignedBases / 2 ) { - readPos = numAlignedBases - ( readPos + 1 ); - } + if (readPos > numAlignedBases / 2) + readPos = numAlignedBases - (readPos + 1); + + + if (p.getBase() == ref) + refQuals.add((double) readPos); + else if (p.getBase() == alt) + altQuals.add((double) readPos); - if ( p.getBase() == ref ) { - refQuals.add( (double)readPos ); - } else if ( p.getBase() == alt ) { - altQuals.add( (double)readPos ); - } } } } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. 
@@ -52,18 +57,15 @@ public class ReadPosRankSumTest extends RankSumTest { // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pielup element is "ref" // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p : pileup) { if (indelLikelihoodMap.containsKey(p)) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + LinkedHashMap el = indelLikelihoodMap.get(p); // retrieve likelihood information corresponding to this read + double refLikelihood = 0.0, altLikelihood = Double.NEGATIVE_INFINITY; // by design, first element in LinkedHashMap was ref allele for (Allele a : el.keySet()) { - if (a.isReference()) - refLikelihood =el.get(a); + refLikelihood = el.get(a); else { double like = el.get(a); if (like >= altLikelihood) @@ -75,23 +77,22 @@ public class ReadPosRankSumTest extends RankSumTest { final int numAlignedBases = getNumAlignedBases(p.getRead()); int rp = readPos; - if( readPos > numAlignedBases / 2 ) { - readPos = numAlignedBases - ( readPos + 1 ); + if (readPos > numAlignedBases / 2) { + readPos = numAlignedBases - (readPos + 1); } - //if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); + //if (DEBUG) System.out.format("R:%s start:%d C:%s 
offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); // if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads // where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping. - // if (readPos < -1) + // if (readPos < -1) // return; - if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { - refQuals.add((double)readPos); + if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { + refQuals.add((double) readPos); //if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos); - } - else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { - altQuals.add((double)readPos); - //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); + } else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { + altQuals.add((double) readPos); + //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); } @@ -115,7 +116,7 @@ public class ReadPosRankSumTest extends RankSumTest { // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, // and may leave a string of Q2 bases still hanging off the reads. 
- for (int i=numStartClippedBases; i < unclippedReadBases.length; i++) { + for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) { if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) numStartClippedBases++; else @@ -134,7 +135,7 @@ public class ReadPosRankSumTest extends RankSumTest { // compute total number of clipped bases (soft or hard clipped) // check for hard clips (never consider these bases): final Cigar c = read.getCigar(); - CigarElement last = c.getCigarElement(c.numCigarElements()-1); + CigarElement last = c.getCigarElement(c.numCigarElements() - 1); int numEndClippedBases = 0; if (last.getOperator() == CigarOperator.H) { @@ -145,7 +146,7 @@ public class ReadPosRankSumTest extends RankSumTest { // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, // and may leave a string of Q2 bases still hanging off the reads. - for (int i=unclippedReadBases.length-numEndClippedBases-1; i >= 0; i-- ){ + for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) { if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) numEndClippedBases++; else @@ -157,8 +158,6 @@ public class ReadPosRankSumTest extends RankSumTest { } int getOffsetFromClippedReadStart(SAMRecord read, int offset) { - - - return offset - getNumClippedBasesAtStart(read); + return offset - getNumClippedBasesAtStart(read); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index ae7077230..7143606ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -278,7 +278,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( qual == 0 ) return 0; 
- if ( elt.isReducedRead() ) { + if ( elt.getRead().isReducedRead() ) { // reduced read representation if ( BaseUtils.isRegularBase( obsBase )) { int representativeCount = elt.getRepresentativeCount(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 0756caf03..9126c0495 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -60,14 +60,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private final int maxAlternateAlleles; private PairHMMIndelErrorModel pairModel; - private static ThreadLocal>> indelLikelihoodMap = - new ThreadLocal>>() { - protected synchronized HashMap> initialValue() { - return new HashMap>(); + private static ThreadLocal>> indelLikelihoodMap = + new ThreadLocal>>() { + protected synchronized HashMap> initialValue() { + return new HashMap>(); } }; - private LinkedHashMap haplotypeMap; + private LinkedHashMap haplotypeMap; // gdebug removeme // todo -cleanup @@ -75,13 +75,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private ArrayList alleleList; static { - indelLikelihoodMap.set(new HashMap>()); + indelLikelihoodMap.set(new HashMap>()); } protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); alleleList = new ArrayList(); getAlleleListFromVCF = 
UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; @@ -91,7 +91,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES; doMultiAllelicCalls = UAC.MULTI_ALLELIC; - haplotypeMap = new LinkedHashMap(); + haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } @@ -99,15 +99,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private ArrayList computeConsensusAlleles(ReferenceContext ref, Map contexts, AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { - Allele refAllele=null, altAllele=null; + Allele refAllele = null, altAllele = null; GenomeLoc loc = ref.getLocus(); ArrayList aList = new ArrayList(); - HashMap consensusIndelStrings = new HashMap(); + HashMap consensusIndelStrings = new HashMap(); int insCount = 0, delCount = 0; // quick check of total number of indels in pileup - for ( Map.Entry sample : contexts.entrySet() ) { + for (Map.Entry sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); @@ -118,21 +118,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) return aList; - for ( Map.Entry sample : contexts.entrySet() ) { + for (Map.Entry sample : contexts.entrySet()) { // todo -- warning, can be duplicating expensive partition here AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - - - for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { + for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { //SAMRecord read = p.getRead(); GATKSAMRecord 
read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; - if(ReadUtils.is454Read(read)) { + if (ReadUtils.is454Read(read)) { continue; } @@ -151,62 +149,57 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // In this case, the read could have any of the inserted bases and we need to build a consensus for (String s : consensusIndelStrings.keySet()) { int cnt = consensusIndelStrings.get(s); - if (s.startsWith(indelString)){ + if (s.startsWith(indelString)) { // case 1: current insertion is prefix of indel in hash map - consensusIndelStrings.put(s,cnt+1); + consensusIndelStrings.put(s, cnt + 1); foundKey = true; break; - } - else if (indelString.startsWith(s)) { + } else if (indelString.startsWith(s)) { // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. consensusIndelStrings.remove(s); - consensusIndelStrings.put(indelString,cnt+1); + consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; break; } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key - consensusIndelStrings.put(indelString,1); + consensusIndelStrings.put(indelString, 1); - } - else if (read.getAlignmentStart() == loc.getStart()+1) { + } else if (read.getAlignmentStart() == loc.getStart() + 1) { // opposite corner condition: read will start at current locus with an insertion for (String s : consensusIndelStrings.keySet()) { int cnt = consensusIndelStrings.get(s); - if (s.endsWith(indelString)){ + if (s.endsWith(indelString)) { // case 1: current insertion is suffix of indel in hash map - consensusIndelStrings.put(s,cnt+1); + consensusIndelStrings.put(s, cnt + 1); foundKey = true; break; - } - else if (indelString.endsWith(s)) { + } else if (indelString.endsWith(s)) { // case 2: indel stored in hash table is suffix of current insertion // In this case, new bases are new key. 
consensusIndelStrings.remove(s); - consensusIndelStrings.put(indelString,cnt+1); + consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; break; } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key - consensusIndelStrings.put(indelString,1); + consensusIndelStrings.put(indelString, 1); - } - else { + } else { // normal case: insertion somewhere in the middle of a read: add count to hash map - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); + int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0; + consensusIndelStrings.put(indelString, cnt + 1); } - } - else if (p.isDeletion()) { - indelString = String.format("D%d",p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); + } else if (p.isDeletion()) { + indelString = String.format("D%d", p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString) ? 
consensusIndelStrings.get(indelString) : 0; + consensusIndelStrings.put(indelString, cnt + 1); } } @@ -227,18 +220,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // get deletion length int dLen = Integer.valueOf(s.substring(1)); // get ref bases of accurate deletion - int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); + int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); stop = loc.getStart() + dLen; - byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases,true); + refAllele = Allele.create(refBases, true); altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); } - } - else { + } else { // insertion case - if (Allele.acceptableAlleleBases(s)) { + if (Allele.acceptableAlleleBases(s)) { refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); altAllele = Allele.create(s, false); stop = loc.getStart(); @@ -288,7 +280,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood Allele alternateAlleleToUse, boolean useBAQedPileup, GenomeLocParser locParser) { - if ( tracker == null ) + if (tracker == null) return null; GenomeLoc loc = ref.getLocus(); @@ -299,12 +291,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // starting a new site: clear allele list alleleList.clear(); lastSiteVisited = ref.getLocus(); - indelLikelihoodMap.set(new HashMap>()); + indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); if (getAlleleListFromVCF) { - for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { - if( vc_input != null && + for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) { + if (vc_input != null && allowableTypes.contains(vc_input.getType()) && 
ref.getLocus().getStart() == vc_input.getStart()) { vc = vc_input; @@ -312,7 +304,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } // ignore places where we don't have a variant - if ( vc == null ) + if (vc == null) return null; alleleList.clear(); @@ -324,15 +316,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood else alleleList.add(a); - } - else { + } else { for (Allele a : vc.getAlleles()) alleleList.add(a); } - } - else { - alleleList = computeConsensusAlleles(ref,contexts, contextType, locParser); + } else { + alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser); if (alleleList.isEmpty()) return null; } @@ -342,9 +332,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return null; // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) - if (ref.getWindow().getStop() < loc.getStop()+HAPLOTYPE_SIZE) + if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) return null; - if ( !(priors instanceof DiploidIndelGenotypePriors) ) + if (!(priors instanceof DiploidIndelGenotypePriors)) throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model"); if (alleleList.isEmpty()) @@ -355,8 +345,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // look for alt allele that has biggest length distance to ref allele int maxLenDiff = 0; - for (Allele a: alleleList) { - if(a.isNonReference()) { + for (Allele a : alleleList) { + if (a.isNonReference()) { int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); if (lenDiff > maxLenDiff) { maxLenDiff = lenDiff; @@ -366,11 +356,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - final int hsize = 
(int)ref.getWindow().size()-Math.abs(eventLength)-1; - final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; + final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1; + final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; - if (hsize <=0) { - logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping",loc.toString())); + if (hsize <= 0) { + logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString())); return null; } haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), @@ -388,7 +378,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. - for ( Map.Entry sample : contexts.entrySet() ) { + for (Map.Entry sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); ReadBackedPileup pileup = null; @@ -397,8 +387,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood else if (context.hasBasePileup()) pileup = context.getBasePileup(); - if (pileup != null ) { - final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + if (pileup != null) { + final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); HashMap attributes = new HashMap(); @@ -407,9 +397,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); if (DEBUG) { 
- System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString()); - for (int k=0; k < genotypeLikelihoods.length; k++) - System.out.format("%1.4f ",genotypeLikelihoods[k]); + System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString()); + for (int k = 0; k < genotypeLikelihoods.length; k++) + System.out.format("%1.4f ", genotypeLikelihoods[k]); System.out.println(); } } @@ -421,21 +411,21 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { // for indels, stop location is one more than ref allele length boolean hasNullAltAllele = false; - for ( Allele a : alleles ) { - if ( a.isNull() ) { + for (Allele a : alleles) { + if (a.isNull()) { hasNullAltAllele = true; break; } } int endLoc = loc.getStart() + refAllele.length(); - if( !hasNullAltAllele ) + if (!hasNullAltAllele) endLoc--; return endLoc; } - public static HashMap> getIndelLikelihoodMap() { + public static HashMap> getIndelLikelihoodMap() { return indelLikelihoodMap.get(); } @@ -443,8 +433,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // so that per-sample DP will include deletions covering the event. 
protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; - for ( PileupElement p : pileup ) { - if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) ) + for (PileupElement p : pileup) { + if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase())) count++; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 81c766e4d..d9ee2ba1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -212,7 +212,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public class BAQedPileupElement extends PileupElement { public BAQedPileupElement( final PileupElement PE ) { - super(PE.getRead(), PE.getOffset()); + super(PE.getRead(), PE.getOffset(), PE.isDeletion()); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 586b86490..1fa7101ca 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -40,7 +40,7 @@ import java.util.*; * @author mhanna * @version 0.1 */ -public abstract class AbstractReadBackedPileup,PE extends PileupElement> implements ReadBackedPileup { +public abstract class AbstractReadBackedPileup, PE extends PileupElement> implements ReadBackedPileup { protected final GenomeLoc loc; protected final PileupElementTracker pileupElementTracker; @@ -55,23 +55,18 @@ public abstract class AbstractReadBackedPileup reads, List offsets ) { + public 
AbstractReadBackedPileup(GenomeLoc loc, List reads, List offsets) { this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads,offsets); + this.pileupElementTracker = readsOffsets2Pileup(reads, offsets); } - public AbstractReadBackedPileup(GenomeLoc loc, List reads, int offset ) { - this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads,offset); - } /** * Create a new version of a read backed pileup at loc without any aligned reads - * */ public AbstractReadBackedPileup(GenomeLoc loc) { this(loc, new UnifiedPileupElementTracker()); @@ -81,11 +76,10 @@ public abstract class AbstractReadBackedPileup pileup) { - if ( loc == null ) throw new ReviewedStingException("Illegal null genomeloc in ReadBackedPileup"); - if ( pileup == null ) throw new ReviewedStingException("Illegal null pileup in ReadBackedPileup"); + if (loc == null) throw new ReviewedStingException("Illegal null genomeloc in ReadBackedPileup"); + if (pileup == null) throw new ReviewedStingException("Illegal null pileup in ReadBackedPileup"); this.loc = loc; this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); @@ -94,12 +88,13 @@ public abstract class AbstractReadBackedPileup pileup, int size, int nDeletions, int nMQ0Reads) { - if ( loc == null ) throw new ReviewedStingException("Illegal null genomeloc in UnifiedReadBackedPileup"); - if ( pileup == null ) throw new ReviewedStingException("Illegal null pileup in UnifiedReadBackedPileup"); + if (loc == null) throw new ReviewedStingException("Illegal null genomeloc in UnifiedReadBackedPileup"); + if (pileup == null) throw new ReviewedStingException("Illegal null pileup in UnifiedReadBackedPileup"); this.loc = loc; this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); @@ -115,16 +110,21 @@ public abstract class AbstractReadBackedPileup> pileupsBySample) { + protected AbstractReadBackedPileup(GenomeLoc loc, Map> pileupsBySample) { this.loc = loc; PerSamplePileupElementTracker tracker = new 
PerSamplePileupElementTracker(); - for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { - tracker.addElements(pileupEntry.getKey(),pileupEntry.getValue().pileupElementTracker); + for (Map.Entry> pileupEntry : pileupsBySample.entrySet()) { + tracker.addElements(pileupEntry.getKey(), pileupEntry.getValue().pileupElementTracker); addPileupToCumulativeStats(pileupEntry.getValue()); } this.pileupElementTracker = tracker; } + public AbstractReadBackedPileup(GenomeLoc loc, List reads, int offset) { + this.loc = loc; + this.pileupElementTracker = readsOffsets2Pileup(reads, offset); + } + /** * Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront, * so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling @@ -135,12 +135,12 @@ public abstract class AbstractReadBackedPileup pileup) { + protected void addPileupToCumulativeStats(AbstractReadBackedPileup pileup) { size += pileup.getNumberOfElements(); abstractSize += pileup.depthOfCoverage(); nDeletions += pileup.getNumberOfDeletions(); @@ -167,14 +167,17 @@ public abstract class AbstractReadBackedPileup readsOffsets2Pileup(List reads, List offsets ) { - if ( reads == null ) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); - if ( offsets == null ) throw new ReviewedStingException("Illegal null offsets list in UnifiedReadBackedPileup"); - if ( reads.size() != offsets.size() ) throw new ReviewedStingException("Reads and offset lists have different sizes!"); + private PileupElementTracker readsOffsets2Pileup(List reads, List offsets) { + if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); + if (offsets == null) throw new ReviewedStingException("Illegal null offsets list in UnifiedReadBackedPileup"); + if (reads.size() != offsets.size()) + throw new ReviewedStingException("Reads and offset lists have different sizes!"); 
UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for ( int i = 0; i < reads.size(); i++ ) { - pileup.add(createNewPileupElement(reads.get(i),offsets.get(i))); + for (int i = 0; i < reads.size(); i++) { + GATKSAMRecord read = reads.get(i); + int offset = offsets.get(i); + pileup.add(createNewPileupElement(read, offset, BaseUtils.simpleBaseToBaseIndex(read.getReadBases()[offset]) == BaseUtils.D)); } return pileup; @@ -187,20 +190,21 @@ public abstract class AbstractReadBackedPileup readsOffsets2Pileup(List reads, int offset ) { - if ( reads == null ) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); - if ( offset < 0 ) throw new ReviewedStingException("Illegal offset < 0 UnifiedReadBackedPileup"); + private PileupElementTracker readsOffsets2Pileup(List reads, int offset) { + if (reads == null) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); + if (offset < 0) throw new ReviewedStingException("Illegal offset < 0 UnifiedReadBackedPileup"); UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for ( int i = 0; i < reads.size(); i++ ) { - pileup.add(createNewPileupElement( reads.get(i), offset )); + for (GATKSAMRecord read : reads) { + pileup.add(createNewPileupElement(read, offset, BaseUtils.simpleBaseToBaseIndex(read.getReadBases()[offset]) == BaseUtils.D)); } return pileup; } - protected abstract AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset); + protected abstract AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); + + protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion); // -------------------------------------------------------- // @@ -217,32 +221,31 @@ public abstract class AbstractReadBackedPileup 0 ) { - 
if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (getNumberOfDeletions() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutDeletions(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupWithoutDeletions(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( !p.isDeletion() ) { + for (PE p : tracker) { + if (!p.isDeletion()) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } else { - return (RBP)this; + return (RBP) this; } } @@ -256,21 +259,20 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker 
filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getOverlappingFragmentFilteredPileup(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - Map filteredPileup = new HashMap(); + return (RBP) createNewPileup(loc, filteredTracker); + } else { + Map filteredPileup = new HashMap(); - for ( PE p : pileupElementTracker ) { + for (PE p : pileupElementTracker) { String readName = p.getRead().getReadName(); // if we've never seen this read before, life is good @@ -292,10 +294,10 @@ public abstract class AbstractReadBackedPileup filteredTracker = new UnifiedPileupElementTracker(); - for(PE filteredElement: filteredPileup.values()) + for (PE filteredElement : filteredPileup.values()) filteredTracker.add(filteredElement); - return (RBP)createNewPileup(loc,filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } @@ -309,300 +311,299 @@ public abstract class AbstractReadBackedPileup 0 ) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (getNumberOfMappingQualityZeroReads() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) 
{ PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutMappingQualityZeroReads(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupWithoutMappingQualityZeroReads(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( p.getRead().getMappingQuality() > 0 ) { + for (PE p : tracker) { + if (p.getRead().getMappingQuality() > 0) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } else { - return (RBP)this; + return (RBP) this; } } public RBP getPositiveStrandPileup() { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPositiveStrandPileup(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + 
AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPositiveStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + return (RBP) createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( !p.getRead().getReadNegativeStrandFlag() ) { + for (PE p : tracker) { + if (!p.getRead().getReadNegativeStrandFlag()) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } /** * Gets the pileup consisting of only reads on the negative strand. + * * @return A read-backed pileup consisting only of reads on the negative strand. 
*/ public RBP getNegativeStrandPileup() { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getNegativeStrandPileup(); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getNegativeStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + return (RBP) createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : tracker ) { - if ( p.getRead().getReadNegativeStrandFlag() ) { + for (PE p : tracker) { + if (p.getRead().getReadNegativeStrandFlag()) { filteredTracker.add(p); } } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } /** * Gets a pileup consisting of all those elements passed by a given filter. + * * @param filter Filter to use when testing for elements. * @return a pileup without the given filtered elements. 
*/ public RBP getFilteredPileup(PileupElementFilter filter) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getFilteredPileup(filter); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getFilteredPileup(filter); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { + return (RBP) createNewPileup(loc, filteredTracker); + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : pileupElementTracker ) { - if( filter.allow(p) ) + for (PE p : pileupElementTracker) { + if (filter.allow(p)) filteredTracker.add(p); } - return (RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } - /** Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. 
+ * * @param minBaseQ * @param minMapQ * @return */ @Override - public RBP getBaseAndMappingFilteredPileup( int minBaseQ, int minMapQ ) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + public RBP getBaseAndMappingFilteredPileup(int minBaseQ, int minMapQ) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ,minMapQ); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ, minMapQ); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { + return (RBP) createNewPileup(loc, filteredTracker); + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for ( PE p : pileupElementTracker ) { - if ( p.getRead().getMappingQuality() >= minMapQ && + for (PE p : pileupElementTracker) { + if (p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || - ((p instanceof ExtendedEventPileupElement) && ((ExtendedEventPileupElement)p).getType() == ExtendedEventPileupElement.Type.NOEVENT) || - p.getQual() >= minBaseQ) ) { + ((p instanceof ExtendedEventPileupElement) && ((ExtendedEventPileupElement) p).getType() == ExtendedEventPileupElement.Type.NOEVENT) || + p.getQual() >= minBaseQ)) { filteredTracker.add(p); } } - return 
(RBP)createNewPileup(loc, filteredTracker); + return (RBP) createNewPileup(loc, filteredTracker); } } - /** Returns subset of this pileup that contains only bases with quality >= minBaseQ. + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ. * This method allocates and returns a new instance of ReadBackedPileup. + * * @param minBaseQ * @return */ @Override - public RBP getBaseFilteredPileup( int minBaseQ ) { + public RBP getBaseFilteredPileup(int minBaseQ) { return getBaseAndMappingFilteredPileup(minBaseQ, -1); } - /** Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. + /** + * Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. * This method allocates and returns a new instance of ReadBackedPileup. + * * @param minMapQ * @return */ @Override - public RBP getMappingFilteredPileup( int minMapQ ) { + public RBP getMappingFilteredPileup(int minMapQ) { return getBaseAndMappingFilteredPileup(-1, minMapQ); } /** * Gets a list of the read groups represented in this pileup. + * * @return */ @Override public Collection getReadGroups() { Set readGroups = new HashSet(); - for(PileupElement pileupElement: this) + for (PileupElement pileupElement : this) readGroups.add(pileupElement.getRead().getReadGroup().getReadGroupId()); return readGroups; } /** * Gets the pileup for a given read group. Horrendously inefficient at this point. + * * @param targetReadGroupId Identifier for the read group. * @return A read-backed pileup containing only the reads in the given read group. 
*/ @Override public RBP getPileupForReadGroup(String targetReadGroupId) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroup(targetReadGroupId); - if(pileup != null) - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroup(targetReadGroupId); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; - } - else { + return filteredTracker.size() > 0 ? 
(RBP) createNewPileup(loc, filteredTracker) : null; + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(targetReadGroupId != null) { - if(read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) + if (targetReadGroupId != null) { + if (read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; } } /** * Gets the pileup for a set of read groups. Horrendously inefficient at this point. + * * @param rgSet List of identifiers for the read groups. * @return A read-backed pileup containing only the reads in the given read groups. 
*/ @Override public RBP getPileupForReadGroups(final HashSet rgSet) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroups(rgSet); - if(pileup != null) - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroups(rgSet); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; - } - else { + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(rgSet != null && !rgSet.isEmpty()) { - if(read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) + if (rgSet != null && !rgSet.isEmpty()) { + if (read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? 
(RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; } } @Override public RBP getPileupForLane(String laneID) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID); - if(pileup != null) - filteredTracker.addElements(sample,pileup.pileupElementTracker); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getPileupForLane(laneID); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; - } - else { + return filteredTracker.size() > 0 ? 
(RBP) createNewPileup(loc, filteredTracker) : null; + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(laneID != null) { - if(read.getReadGroup() != null && - (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different - (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same + if (laneID != null) { + if (read.getReadGroup() != null && + (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different + (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; } } public Collection getSamples() { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; return new HashSet(tracker.getSamples()); - } - else { + } else { Collection sampleNames = new HashSet(); - for(PileupElement p: this) { + for (PileupElement p : this) { GATKSAMRecord read = p.getRead(); String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; sampleNames.add(sampleName); @@ -619,103 +620,98 @@ public abstract class AbstractReadBackedPileup positions = new TreeSet(); - for ( int i = 0; i < desiredCoverage; /* no update */ ) { - if ( positions.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(size)) ) + for (int i = 0; i < desiredCoverage; /* no update */) { + if (positions.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(size))) i++; } - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); int current = 0; - for(final String sample: tracker.getSamples()) { + for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); List filteredPileup = new ArrayList(); - for(PileupElement p: perSampleElements) { - if(positions.contains(current)) + for (PileupElement p : perSampleElements) { + if (positions.contains(current)) filteredPileup.add(p); } - if(!filteredPileup.isEmpty()) { - AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements); - filteredTracker.addElements(sample,pileup.pileupElementTracker); + if (!filteredPileup.isEmpty()) { + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements); + filteredTracker.addElements(sample, pileup.pileupElementTracker); } current++; } - return (RBP)createNewPileup(loc,filteredTracker); - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker)pileupElementTracker; + return (RBP) createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; UnifiedPileupElementTracker 
filteredTracker = new UnifiedPileupElementTracker(); Iterator positionIter = positions.iterator(); - while ( positionIter.hasNext() ) { - int nextReadToKeep = (Integer)positionIter.next(); + while (positionIter.hasNext()) { + int nextReadToKeep = (Integer) positionIter.next(); filteredTracker.add(tracker.get(nextReadToKeep)); } - return (RBP)createNewPileup(getLocation(), filteredTracker); + return (RBP) createNewPileup(getLocation(), filteredTracker); } } @Override public RBP getPileupForSamples(Collection sampleNames) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleNames); - return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null; - } - else { + return filteredElements != null ? (RBP) createNewPileup(loc, filteredElements) : null; + } else { HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. - if(read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. 
+ if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? (RBP) createNewPileup(loc, filteredTracker) : null; } } @Override public RBP getPileupForSample(String sampleName) { - if(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleName); - return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null; - } - else { + return filteredElements != null ? (RBP) createNewPileup(loc, filteredElements) : null; + } else { UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { + for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if(sampleName != null) { - if(read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) + if (sampleName != null) { + if (read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) filteredTracker.add(p); } } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + return filteredTracker.size() > 0 ? 
(RBP) createNewPileup(loc, filteredTracker) : null; } } @@ -727,9 +723,9 @@ public abstract class AbstractReadBackedPileup * for (PileupElement p : this) { doSomething(p); } - * + *

* Provides efficient iteration of the data. * * @return @@ -739,9 +735,17 @@ public abstract class AbstractReadBackedPileup() { private final Iterator wrappedIterator = pileupElementTracker.iterator(); - public boolean hasNext() { return wrappedIterator.hasNext(); } - public PileupElement next() { return wrappedIterator.next(); } - public void remove() { throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); } + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public PileupElement next() { + return wrappedIterator.next(); + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); + } }; } @@ -784,7 +788,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; - for(final String sample: tracker.getSamples()) { - int[] countsBySample = createNewPileup(loc,tracker.getElements(sample)).getBaseCounts(); - for(int i = 0; i < counts.length; i++) + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (final String sample : tracker.getSamples()) { + int[] countsBySample = createNewPileup(loc, tracker.getElements(sample)).getBaseCounts(); + for (int i = 0; i < counts.length; i++) counts[i] += countsBySample[i]; } - } - else { - for ( PileupElement pile : this ) { + } else { + for (PileupElement pile : this) { // skip deletion sites - if ( ! 
pile.isDeletion() ) { - int index = BaseUtils.simpleBaseToBaseIndex((char)pile.getBase()); + if (!pile.isDeletion()) { + int index = BaseUtils.simpleBaseToBaseIndex((char) pile.getBase()); if (index != -1) counts[index]++; } @@ -857,65 +860,80 @@ public abstract class AbstractReadBackedPileup getReads() { List reads = new ArrayList(getNumberOfElements()); - for ( PileupElement pile : this ) { reads.add(pile.getRead()); } + for (PileupElement pile : this) { + reads.add(pile.getRead()); + } return reads; } /** * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * * @return */ @Override public List getOffsets() { List offsets = new ArrayList(getNumberOfElements()); - for ( PileupElement pile : this ) { offsets.add(pile.getOffset()); } + for (PileupElement pile : this) { + offsets.add(pile.getOffset()); + } return offsets; } /** * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time + * * @return */ @Override public byte[] getBases() { byte[] v = new byte[getNumberOfElements()]; int pos = 0; - for ( PileupElement pile : pileupElementTracker ) { v[pos++] = pile.getBase(); } + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getBase(); + } return v; } /** * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time + * * @return */ @Override public byte[] getQuals() { byte[] v = new byte[getNumberOfElements()]; int pos = 0; - for ( PileupElement pile : pileupElementTracker ) { v[pos++] = pile.getQual(); } + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getQual(); + } return v; } /** * Get an array of the mapping qualities + * * @return */ @Override public byte[] getMappingQuals() { byte[] v = new byte[getNumberOfElements()]; int pos = 0; - for ( PileupElement pile : pileupElementTracker ) { v[pos++] = (byte)pile.getRead().getMappingQuality(); } + for (PileupElement pile : pileupElementTracker) { + v[pos++] = (byte) pile.getRead().getMappingQuality(); + } return v; } - static String quals2String( byte[] quals ) { + static String quals2String(byte[] quals) { StringBuilder qualStr = new StringBuilder(); - for ( int qual : quals ) { + for (int qual : quals) { qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 qualStr.append(qualChar); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java index 1e5e4d4e5..1d7e6f636 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java @@ -12,7 +12,7 @@ import java.util.Arrays; * are seen on the base-by-base basis (i.e. the pileup does keep the information about the current reference base being deleted * in some reads), but the information about the extended event (deletion length, string of all deleted bases) is not kept. * The insertions that may be present in some reads are not seen at all in such strict reference traversal mode. - * + *

* By convention, any extended event (indel) is mapped onto the reference at the last base prior to the event (i.e. * last base before the insertion or deletion). If the special "extended" traversal mode is turned on and there is * an indel in at least one read that maps onto the reference position Z, the walker's map function will be called twice: @@ -22,9 +22,9 @@ import java.util.Arrays; * (covered) reference position. Note that if the extended event at Z was a deletion, the "standard" base pileup at * Z+1 and following bases may still contain deleted bases. However the fully extended event call will be performed * only once, at the position where the indel maps (starts). - * + *

* This class wraps an "extended" event (indel) so that in can be added to a pileup of events at a given location. - * + *

* Created by IntelliJ IDEA. * User: asivache * Date: Dec 21, 2009 @@ -39,40 +39,52 @@ public class ExtendedEventPileupElement extends PileupElement { private Type type = null; private int eventLength = -1; private String eventBases = null; // if it is a deletion, we do not have information about the actual deleted bases - // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases + // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases private SAMRecord read; private int offset; // position in the read immediately BEFORE the event // This is broken! offset is always zero because these member variables are shadowed by base class - /** Constructor for extended pileup element (indel). - * - * @param read the read, in which the indel is observed - * @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion) - * @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT) - * @param eventBases inserted bases. 
null indicates that the event is a deletion; ignored if length<=0 (noevent) - */ - public ExtendedEventPileupElement( GATKSAMRecord read, int offset, int length, byte[] eventBases ) { - super(read, offset); - this.eventLength = length; - if ( length <= 0 ) type = Type.NOEVENT; - else { - if ( eventBases != null ) { - this.eventBases = new String(eventBases).toUpperCase(); - type = Type.INSERTION; - } else { - type = Type.DELETION; - } - } + + public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) { + super(read, offset, type == Type.DELETION); + this.read = read; + this.offset = offset; + this.eventLength = eventLength; + this.eventBases = eventBases; + this.type = type; } - /** Constructor for deletion or noevent calls - does not take event bases as an argument (as those should - * be null or are ignored in these cases anyway) - * @param read - * @param offset - * @param length + /** + * Quick constructor for insertions. + * + * @param read the read, in which the indel is observed + * @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion) + * @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT) + * @param eventBases inserted bases. 
null indicates that the event is a deletion; ignored if length<=0 (noevent) */ - public ExtendedEventPileupElement( GATKSAMRecord read, int offset, int length ) { - this(read,offset, length, null); + public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int length, byte[] eventBases) { + this(read, offset, length, new String(eventBases).toUpperCase(), Type.INSERTION); + } + + /** + * Quick constructor for non indels (matches) + * + * @param read the read + * @param offset where in the read the match is + */ + public ExtendedEventPileupElement(GATKSAMRecord read, int offset) { + this(read, offset, -1, null, Type.NOEVENT); + } + + /** + * Quick constructor for deletions + * + * @param read the read + * @param offset the last base before the deletion starts (left aligned deletion) + * @param length length of this deletion + */ + public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int length) { + this(read, offset, length, null, Type.DELETION); } public boolean isDeletion() { @@ -87,46 +99,54 @@ public class ExtendedEventPileupElement extends PileupElement { return isDeletion() || isInsertion(); } - public Type getType() { return type; } + public Type getType() { + return type; + } // The offset can be negative with insertions at the start of the read, but a valid base does exist at this position with // a valid base quality. The following code attempts to compensate for that.' @Override public byte getBase() { - return getBase(offset >= 0 ? offset : offset+eventLength); + return getBase(offset >= 0 ? offset : offset + eventLength); } @Override public int getBaseIndex() { - return getBaseIndex(offset >= 0 ? offset : offset+eventLength); + return getBaseIndex(offset >= 0 ? offset : offset + eventLength); } @Override public byte getQual() { - return getQual(offset >= 0 ? offset : offset+eventLength); + return getQual(offset >= 0 ? 
offset : offset + eventLength); } - /** Returns length of the event (number of inserted or deleted bases */ - public int getEventLength() { return eventLength; } + /** + * Returns length of the event (number of inserted or deleted bases + */ + public int getEventLength() { + return eventLength; + } - /** Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. - * */ - public String getEventBases() { return eventBases; } + /** + * Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. + */ + public String getEventBases() { + return eventBases; + } @Override public String toString() { char c = '.'; - String fillStr = null ; - if ( isDeletion() ) { + String fillStr = null; + if (isDeletion()) { c = '-'; - char [] filler = new char[eventLength]; + char[] filler = new char[eventLength]; Arrays.fill(filler, 'D'); fillStr = new String(filler); - } - else if ( isInsertion() ) c = '+'; - return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel()? - (isInsertion() ? eventBases : fillStr ): "", getMappingQual()); + } else if (isInsertion()) c = '+'; + return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel() ? + (isInsertion() ? 
eventBases : fillStr) : "", getMappingQual()); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 2d13d6e59..73f010d40 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.pileup; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** @@ -21,25 +22,61 @@ public class PileupElement implements Comparable { protected final GATKSAMRecord read; protected final int offset; + protected final boolean isDeletion; + + /** + * Creates a new pileup element. + * + * @param read the read we are adding to the pileup + * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) + * @param isDeletion whether or not this base is a deletion + */ @Requires({ "read != null", "offset >= -1", "offset <= read.getReadLength()"}) - public PileupElement( GATKSAMRecord read, int offset ) { + public PileupElement(GATKSAMRecord read, int offset, boolean isDeletion) { + if (offset < 0 && isDeletion) + throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset"); + this.read = read; this.offset = offset; + this.isDeletion = isDeletion; } + // /** +// * Creates a NON DELETION pileup element. +// * +// * use this constructor only for insertions and matches/mismatches. +// * @param read the read we are adding to the pileup +// * @param offset the position in the read for this base. All deletions must be left aligned! 
(-1 is only allowed for reads starting with insertions) +// */ +// @Requires({ +// "read != null", +// "offset >= -1", +// "offset <= read.getReadLength()"}) +// public PileupElement( GATKSAMRecord read, int offset ) { +// this(read, offset, false); +// } +// public boolean isDeletion() { + return isDeletion; + } + + public boolean isInsertionAtBeginningOfRead() { return offset == -1; } @Ensures("result != null") - public GATKSAMRecord getRead() { return read; } + public GATKSAMRecord getRead() { + return read; + } @Ensures("result == offset") - public int getOffset() { return offset; } + public int getOffset() { + return offset; + } public byte getBase() { return getBase(offset); @@ -59,30 +96,30 @@ public class PileupElement implements Comparable { @Ensures("result != null") public String toString() { - return String.format("%s @ %d = %c Q%d", getRead().getReadName(), getOffset(), (char)getBase(), getQual()); + return String.format("%s @ %d = %c Q%d", getRead().getReadName(), getOffset(), (char) getBase(), getQual()); } protected byte getBase(final int offset) { - return isDeletion() ? DELETION_BASE : read.getReadBases()[offset]; + return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_BASE : read.getReadBases()[offset]; } protected int getBaseIndex(final int offset) { - return BaseUtils.simpleBaseToBaseIndex(isDeletion() ? DELETION_BASE : read.getReadBases()[offset]); + return BaseUtils.simpleBaseToBaseIndex((isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_BASE : read.getReadBases()[offset]); } protected byte getQual(final int offset) { - return isDeletion() ? DELETION_QUAL : read.getBaseQualities()[offset]; + return (isDeletion() || isInsertionAtBeginningOfRead()) ? 
DELETION_QUAL : read.getBaseQualities()[offset]; } @Override public int compareTo(final PileupElement pileupElement) { - if ( offset < pileupElement.offset ) + if (offset < pileupElement.offset) return -1; - else if ( offset > pileupElement.offset ) + else if (offset > pileupElement.offset) return 1; - else if ( read.getAlignmentStart() < pileupElement.read.getAlignmentStart() ) + else if (read.getAlignmentStart() < pileupElement.read.getAlignmentStart()) return -1; - else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() ) + else if (read.getAlignmentStart() > pileupElement.read.getAlignmentStart()) return 1; else return 0; @@ -94,13 +131,26 @@ public class PileupElement implements Comparable { // // -------------------------------------------------------------------------- - public boolean isReducedRead() { - return read.isReducedRead(); - } +// public boolean isReducedRead() { +// return read.isReducedRead(); +// } + /** + * Returns the number of elements in the pileup element. + *

+ * Unless this is a reduced read, the number of elements in a pileup element is one. In the event of + * this being a reduced read and a deletion, we return the average number of elements between the left + * and right elements to the deletion. We assume the deletion to be left aligned. + * + * @return + */ public int getRepresentativeCount() { - // TODO -- if we ever decide to reduce the representation of deletions then this will need to be fixed - return (!isDeletion() && isReducedRead()) ? read.getReducedCount(offset) : 1; + int representativeCount = 1; + + if (read.isReducedRead() && !isInsertionAtBeginningOfRead()) + representativeCount = (isDeletion()) ? Math.round((read.getReducedCount(offset) + read.getReducedCount(offset + 1)) / 2) : read.getReducedCount(offset); + + return representativeCount; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index 43ad06352..bf67d1a70 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -30,33 +30,34 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; -public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup implements ReadBackedExtendedEventPileup { +public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup implements ReadBackedExtendedEventPileup { private int nInsertions; private int maxDeletionLength; // cached value of the length of the longest deletion observed at the site public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, List pileupElements) { - super(loc,pileupElements); + super(loc, pileupElements); } public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { - 
super(loc,tracker); + super(loc, tracker); } /** * Optimization of above constructor where all of the cached data is provided + * * @param loc * @param pileup */ public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, List pileup, int size, - int maxDeletionLength, int nInsertions, int nDeletions, int nMQ0Reads) { - super(loc,pileup,size,nDeletions,nMQ0Reads); + int maxDeletionLength, int nInsertions, int nDeletions, int nMQ0Reads) { + super(loc, pileup, size, nDeletions, nMQ0Reads); this.maxDeletionLength = maxDeletionLength; this.nInsertions = nInsertions; } // this is the good new one - public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { - super(loc,pileupElementsBySample); + public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { + super(loc, pileupElementsBySample); } /** @@ -71,31 +72,31 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< nInsertions = 0; nMQ0Reads = 0; - for ( ExtendedEventPileupElement p : this.toExtendedIterable() ) { + for (ExtendedEventPileupElement p : this.toExtendedIterable()) { - if ( p.isDeletion() ) { + if (p.isDeletion()) { maxDeletionLength = Math.max(maxDeletionLength, p.getEventLength()); } else { - if ( p.isInsertion() ) nInsertions++; + if (p.isInsertion()) nInsertions++; } } } @Override - protected void addPileupToCumulativeStats(AbstractReadBackedPileup pileup) { + protected void addPileupToCumulativeStats(AbstractReadBackedPileup pileup) { super.addPileupToCumulativeStats(pileup); - ReadBackedExtendedEventPileup extendedEventPileup = ((ReadBackedExtendedEventPileup)pileup); + ReadBackedExtendedEventPileup extendedEventPileup = ((ReadBackedExtendedEventPileup) pileup); this.nInsertions += extendedEventPileup.getNumberOfInsertions(); this.maxDeletionLength += extendedEventPileup.getMaxDeletionLength(); } @Override protected ReadBackedExtendedEventPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { - return 
new ReadBackedExtendedEventPileupImpl(loc,tracker); + return new ReadBackedExtendedEventPileupImpl(loc, tracker); } @Override - protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset) { + protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } @@ -110,10 +111,12 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< return nInsertions; } - /** Returns the length of the longest deletion observed at the site this + /** + * Returns the length of the longest deletion observed at the site this * pileup is associated with (NOTE: by convention, both insertions and deletions * are associated with genomic location immediately before the actual event). If * there are no deletions at the site, returns 0. + * * @return */ @Override @@ -123,36 +126,47 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< public Iterable toExtendedIterable() { return new Iterable() { - public Iterator iterator() { return pileupElementTracker.iterator(); } + public Iterator iterator() { + return pileupElementTracker.iterator(); + } }; } /** * Returns an array of the events in this pileup ('I', 'D', or '.'). 
Note this call costs O(n) and allocates fresh array each time + * * @return */ @Override public byte[] getEvents() { byte[] v = new byte[getNumberOfElements()]; int i = 0; - for ( ExtendedEventPileupElement e : this.toExtendedIterable() ) { - switch ( e.getType() ) { - case INSERTION: v[i] = 'I'; break; - case DELETION: v[i] = 'D'; break; - case NOEVENT: v[i] = '.'; break; - default: throw new ReviewedStingException("Unknown event type encountered: "+e.getType()); + for (ExtendedEventPileupElement e : this.toExtendedIterable()) { + switch (e.getType()) { + case INSERTION: + v[i] = 'I'; + break; + case DELETION: + v[i] = 'D'; + break; + case NOEVENT: + v[i] = '.'; + break; + default: + throw new ReviewedStingException("Unknown event type encountered: " + e.getType()); } i++; } return v; - } + } - /** A shortcut for getEventStringsWithCounts(null); + /** + * A shortcut for getEventStringsWithCounts(null); * * @return */ @Override - public List> getEventStringsWithCounts() { + public List> getEventStringsWithCounts() { return getEventStringsWithCounts(null); } @@ -163,44 +177,48 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< // insertion, deletion or no-event, respectively. return String.format("%s %s E %s", getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate - new String(getEvents()) ); + new String(getEvents())); } - /** Returns String representation of all distinct extended events (indels) at the site along with + /** + * Returns String representation of all distinct extended events (indels) at the site along with * observation counts (numbers of reads) for each distinct event. If refBases is null, a simple string representation for * deletions will be generated as "D" (i.e. "5D"); if the reference bases are provided, the actual * deleted sequence will be used in the string representation (e.g. "-AAC"). - * @param refBases reference bases, starting with the current locus (i.e. 
the one immediately before the indel), and - * extending far enough to accomodate the longest deletion (i.e. size of refBases must be at least 1+) + * + * @param refBases reference bases, starting with the current locus (i.e. the one immediately before the indel), and + * extending far enough to accomodate the longest deletion (i.e. size of refBases must be at least 1+) * @return list of distinct events; first element of a pair is a string representation of the event, second element - * gives the number of reads, in which that event was observed + * gives the number of reads, in which that event was observed */ @Override - public List> getEventStringsWithCounts(byte[] refBases) { - Map events = new HashMap(); + public List> getEventStringsWithCounts(byte[] refBases) { + Map events = new HashMap(); - for ( ExtendedEventPileupElement e : this.toExtendedIterable() ) { + for (ExtendedEventPileupElement e : this.toExtendedIterable()) { Integer cnt; String indel = null; - switch ( e.getType() ) { + switch (e.getType()) { case INSERTION: - indel = "+"+e.getEventBases(); + indel = "+" + e.getEventBases(); break; case DELETION: - indel = getDeletionString(e.getEventLength(),refBases); + indel = getDeletionString(e.getEventLength(), refBases); break; - case NOEVENT: continue; - default: throw new ReviewedStingException("Unknown event type encountered: "+e.getType()); + case NOEVENT: + continue; + default: + throw new ReviewedStingException("Unknown event type encountered: " + e.getType()); } cnt = events.get(indel); - if ( cnt == null ) events.put(indel,1); - else events.put(indel,cnt.intValue()+1); + if (cnt == null) events.put(indel, 1); + else events.put(indel, cnt.intValue() + 1); } - List> eventList = new ArrayList>(events.size()); - for ( Map.Entry m : events.entrySet() ) { - eventList.add( new Pair(m.getKey(),m.getValue())); + List> eventList = new ArrayList>(events.size()); + for (Map.Entry m : events.entrySet()) { + eventList.add(new Pair(m.getKey(), m.getValue())); 
} return eventList; } @@ -208,18 +226,19 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< /** * Builds string representation of the deletion event. If refBases is null, the representation will be * "D" (e.g. "5D"); if the reference bases are provided, a verbose representation (e.g. "-AAC") - * will be generated. NOTE: refBases must start with the base prior to the actual deletion (i.e. deleted + * will be generated. NOTE: refBases must start with the base prior to the actual deletion (i.e. deleted * base(s) are refBase[1], refBase[2], ...), and the length of the passed array must be sufficient to accomodate the * deletion length (i.e. size of refBase must be at least length+1). + * * @param length * @param refBases * @return */ private String getDeletionString(int length, byte[] refBases) { - if ( refBases == null ) { - return Integer.toString(length)+"D"; // if we do not have reference bases, we can only report something like "5D" + if (refBases == null) { + return Integer.toString(length) + "D"; // if we do not have reference bases, we can only report something like "5D" } else { - return "-"+new String(refBases,1,length).toUpperCase(); + return "-" + new String(refBases, 1, length).toUpperCase(); } } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index b7445be8d..66ddbe95d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -29,48 +29,49 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.List; import java.util.Map; -public class ReadBackedPileupImpl extends AbstractReadBackedPileup implements ReadBackedPileup { +public class ReadBackedPileupImpl extends AbstractReadBackedPileup implements ReadBackedPileup { public 
ReadBackedPileupImpl(GenomeLoc loc) { super(loc); } - public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets ) { - super(loc,reads,offsets); + public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets) { + super(loc, reads, offsets); } - public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset ) { - super(loc,reads,offset); + public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset) { + super(loc, reads, offset); } public ReadBackedPileupImpl(GenomeLoc loc, List pileupElements) { - super(loc,pileupElements); + super(loc, pileupElements); } - public ReadBackedPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { - super(loc,pileupElementsBySample); + public ReadBackedPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { + super(loc, pileupElementsBySample); } /** * Optimization of above constructor where all of the cached data is provided + * * @param loc * @param pileup */ public ReadBackedPileupImpl(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads) { - super(loc,pileup,size,nDeletions,nMQ0Reads); + super(loc, pileup, size, nDeletions, nMQ0Reads); } protected ReadBackedPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { - super(loc,tracker); + super(loc, tracker); } @Override protected ReadBackedPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { - return new ReadBackedPileupImpl(loc,tracker); + return new ReadBackedPileupImpl(loc, tracker); } @Override - protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset) { - return new PileupElement(read,offset); + protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion) { + return new PileupElement(read, offset, isDeletion); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index b8e892101..3b2736418 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -1,745 +1,774 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; - - -public class AlignmentUtils { - - public static class MismatchCount { - public int numMismatches = 0; - public long mismatchQualities = 0; - } - - public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { - return getMismatchCount(r, refSeq, refIndex).mismatchQualities; - } - - public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { - return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength()); - } - - // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single - // todo -- high performance implementation. 
We can do a lot better than this right now - public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { - MismatchCount mc = new MismatchCount(); - - int readIdx = 0; - int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count - byte[] readSeq = r.getReadBases(); - Cigar c = r.getCigar(); - for (int i = 0 ; i < c.numCigarElements() ; i++) { - - if ( readIdx > endOnRead ) break; - - CigarElement ce = c.getCigarElement(i); - switch ( ce.getOperator() ) { - case M: - for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { - if ( refIndex >= refSeq.length ) - continue; - if ( readIdx < startOnRead ) continue; - if ( readIdx > endOnRead ) break; - byte refChr = refSeq[refIndex]; - byte readChr = readSeq[readIdx]; - // Note: we need to count X/N's as mismatches because that's what SAM requires - //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || - // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) - // continue; // do not count Ns/Xs/etc ? - if ( readChr != refChr ) { - mc.numMismatches++; - mc.mismatchQualities += r.getBaseQualities()[readIdx]; - } - } - break; - case I: - case S: - readIdx += ce.getLength(); - break; - case D: - case N: - refIndex += ce.getLength(); - break; - case H: - case P: - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - return mc; - } - - /** Returns the number of mismatches in the pileup within the given reference context. - * - * @param pileup the pileup with reads - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. 
the center of the window) - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { - int mismatches = 0; - for ( PileupElement p : pileup ) - mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); - return mismatches; - } - - /** Returns the number of mismatches in the pileup element within the given reference context. - * - * @param p the pileup element - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { - return mismatchesInRefWindow(p, ref, ignoreTargetSite, false); - } - - /** Returns the number of mismatches in the pileup element within the given reference context. - * - * @param p the pileup element - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. 
the center of the window) - * @param qualitySumInsteadOfMismatchCount if true, return the quality score sum of the mismatches rather than the count - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) { - int sum = 0; - - int windowStart = ref.getWindow().getStart(); - int windowStop = ref.getWindow().getStop(); - byte[] refBases = ref.getBases(); - byte[] readBases = p.getRead().getReadBases(); - byte[] readQualities = p.getRead().getBaseQualities(); - Cigar c = p.getRead().getCigar(); - - int readIndex = 0; - int currentPos = p.getRead().getAlignmentStart(); - int refIndex = Math.max(0, currentPos - windowStart); - - for (int i = 0 ; i < c.numCigarElements() ; i++) { - CigarElement ce = c.getCigarElement(i); - int cigarElementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { - // are we past the ref window? - if ( currentPos > windowStop ) - break; - - // are we before the ref window? - if ( currentPos < windowStart ) - continue; - - byte refChr = refBases[refIndex++]; - - // do we need to skip the target site? - if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos ) - continue; - - byte readChr = readBases[readIndex]; - if ( readChr != refChr ) - sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1; - } - break; - case I: - case S: - readIndex += cigarElementLength; - break; - case D: - case N: - currentPos += cigarElementLength; - if ( currentPos > windowStart ) - refIndex += Math.min(cigarElementLength, currentPos - windowStart); - break; - case H: - case P: - break; - } - } - - return sum; - } - - /** Returns the number of mismatches in the pileup element within the given reference context. 
- * - * @param read the SAMRecord - * @param ref the reference context - * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good - * @param windowSize window size (on each side) to test - * @return a bitset representing which bases are good - */ - public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { - // first determine the positions with mismatches - int readLength = read.getReadLength(); - BitSet mismatches = new BitSet(readLength); - - // it's possible we aren't starting at the beginning of a read, - // and we don't need to look at any of the previous context outside our window - // (although we do need future context) - int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize); - int currentReadPos = read.getAlignmentStart(); - - byte[] refBases = ref.getBases(); - int refIndex = readStartPos - ref.getWindow().getStart(); - if ( refIndex < 0 ) { - throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); - } - - byte[] readBases = read.getReadBases(); - int readIndex = 0; - - Cigar c = read.getCigar(); - - for (int i = 0 ; i < c.numCigarElements() ; i++) { - CigarElement ce = c.getCigarElement(i); - int cigarElementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int j = 0; j < cigarElementLength; j++, readIndex++) { - // skip over unwanted bases - if ( currentReadPos++ < readStartPos ) - continue; - - // this is possible if reads extend beyond the contig end - if ( refIndex >= refBases.length ) - break; - - byte refChr = refBases[refIndex]; - byte readChr = readBases[readIndex]; - if ( readChr != refChr ) - mismatches.set(readIndex); - - refIndex++; - } - break; - case I: - case S: - readIndex += cigarElementLength; - break; - case D: - case N: - if ( 
currentReadPos >= readStartPos ) - refIndex += cigarElementLength; - currentReadPos += cigarElementLength; - break; - case H: - case P: - break; - } - } - - // all bits are set to false by default - BitSet result = new BitSet(readLength); - - int currentPos = 0, leftPos = 0, rightPos; - int mismatchCount = 0; - - // calculate how many mismatches exist in the windows to the left/right - for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { - if ( mismatches.get(rightPos) ) - mismatchCount++; - } - if ( mismatchCount <= maxMismatches ) - result.set(currentPos); - - // now, traverse over the read positions - while ( currentPos < readLength ) { - // add a new rightmost position - if ( rightPos < readLength && mismatches.get(rightPos++) ) - mismatchCount++; - // re-penalize the previous position - if ( mismatches.get(currentPos++) ) - mismatchCount++; - // don't penalize the current position - if ( mismatches.get(currentPos) ) - mismatchCount--; - // subtract the leftmost position - if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) ) - mismatchCount--; - - if ( mismatchCount <= maxMismatches ) - result.set(currentPos); - } - - return result; - } - /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. - * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but - * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is - * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. - * Formally, this method simply returns the number of M elements in the cigar. - * @param r alignment - * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). 
- */ - public static int getNumAlignmentBlocks(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) { - if (e.getOperator() == CigarOperator.M ) n++; - } - - return n; - } - - public static int getNumAlignedBases(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) { - if (e.getOperator() == CigarOperator.M ) { n += e.getLength(); } - } - - return n; - } - - public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) { - - final byte[] alignment = new byte[read.length]; - int refPos = 0; - int alignPos = 0; - - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos++] = '+'; - } - break; - case D: - case N: - refPos += elementLength; - break; - case M: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = ref[refPos]; - alignPos++; - refPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignment; - } - - public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) { - - boolean atDeletion = false; - if(pileupOffset == -1) { - atDeletion = true; - pileupOffset = refLocus - alignmentStart; - final CigarElement ce = cigar.getCigarElement(0); - if( ce.getOperator() == CigarOperator.S ) { - pileupOffset += ce.getLength(); - } - } - int pos = 0; - int alignmentPos = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - 
final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - pos += elementLength; - if( pos >= pileupOffset ) { - return alignmentPos; - } - break; - case D: - case N: - if(!atDeletion) { - alignmentPos += elementLength; - } else { - if( pos + elementLength - 1 >= pileupOffset ) { - return alignmentPos + (pileupOffset - pos); - } else { - pos += elementLength; - alignmentPos += elementLength; - } - } - break; - case M: - if( pos + elementLength - 1 >= pileupOffset ) { - return alignmentPos + (pileupOffset - pos); - } else { - pos += elementLength; - alignmentPos += elementLength; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignmentPos; - } - - public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) { - - int alignmentLength = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - break; - case D: - case N: - alignmentLength += elementLength; - break; - case M: - alignmentLength += elementLength; - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - - final byte[] alignment = new byte[alignmentLength]; - int alignPos = 0; - int readPos = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - if( alignPos > 0 ) { - if( alignment[alignPos-1] == BaseUtils.A ) { alignment[alignPos-1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.C ) { alignment[alignPos-1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; } - else if( 
alignment[alignPos-1] == BaseUtils.T ) { alignment[alignPos-1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.G ) { alignment[alignPos-1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } - } - case S: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - readPos++; - } - break; - case D: - case N: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = PileupElement.DELETION_BASE; - alignPos++; - } - break; - case M: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = read[readPos]; - alignPos++; - readPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignment; - } - - /** - * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format - * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and - * alignment reference index/start. - * @param r record - * @return true if read is unmapped - */ - public static boolean isReadUnmapped(final SAMRecord r) { - if ( r.getReadUnmappedFlag() ) return true; - - // our life would be so much easier if all sam files followed the specs. In reality, - // sam files (including those generated by maq or bwa) miss headers altogether. When - // reading such a SAM file, reference name is set, but since there is no sequence dictionary, - // null is always returned for referenceIndex. Let's be paranoid here, and make sure that - // we do not call the read "unmapped" when it has only reference name set with ref. index missing - // or vice versa. 
- if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX - || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) - && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; - return true; - } - - /** - * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format - * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and - * alignment reference index/start of the mate. - * @param r sam record for the read - * @return true if read's mate is unmapped - */ - public static boolean isMateUnmapped(final SAMRecord r) { - if ( r.getMateUnmappedFlag() ) return true; - - // our life would be so much easier if all sam files followed the specs. In reality, - // sam files (including those generated by maq or bwa) miss headers altogether. When - // reading such a SAM file, reference name is set, but since there is no sequence dictionary, - // null is always returned for referenceIndex. Let's be paranoid here, and make sure that - // we do not call the read "unmapped" when it has only reference name set with ref. index missing - // or vice versa. - if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX - || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) - && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; - return true; - } - - /** Returns true is read is mapped and mapped uniquely (Q>0). - * - * @param read - * @return - */ - public static boolean isReadUniquelyMapped(SAMRecord read) { - return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0; - } - - /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from - * cycle 1). 
In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). - * @param read - * @return - */ - public static byte [] getQualsInCycleOrder(SAMRecord read) { - if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getBaseQualities(); - - return Utils.reverse(read.getBaseQualities()); - } - - /** Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities - * are available this method will throw a runtime exception. - * @param read - * @return - */ - public static byte [] getOriginalQualsInCycleOrder(SAMRecord read) { - if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getOriginalBaseQualities(); - - return Utils.reverse(read.getOriginalBaseQualities()); - } - - /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq - * starting at 0-based position refIndex on the refSeq and specified by its cigar. - * The last argument readIndex specifies 0-based position on the read where the alignment described by the - * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. - * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. 
The refIndex in this case - * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are - * always the positions where the cigar starts on the ref and on the read, respectively. - * - * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar - * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT - * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence - * is not repeated), the original cigar is returned. - * @param cigar structure of the original alignment - * @param refSeq reference sequence the read is aligned to - * @param readSeq read sequence - * @param refIndex 0-based alignment start position on ref - * @param readIndex 0-based alignment start position on read - * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) - */ - public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { - - int indexOfIndel = -1; - for ( int i = 0; i < cigar.numCigarElements(); i++ ) { - CigarElement ce = cigar.getCigarElement(i); - if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) { - // if there is more than 1 indel, don't left align - if ( indexOfIndel != -1 ) - return cigar; - indexOfIndel = i; - } - } - - // if there is no indel or if the alignment starts with an insertion (so that there - // is no place on the read to move that insertion further left), we are done - if ( indexOfIndel < 1 ) return cigar; - - final int indelLength = cigar.getCigarElement(indexOfIndel).getLength(); - - byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); - if ( 
altString == null ) - return cigar; - - Cigar newCigar = cigar; - for ( int i = 0; i < indelLength; i++ ) { - newCigar = moveCigarLeft(newCigar, indexOfIndel); - byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); - - // check to make sure we haven't run off the end of the read - boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar); - - if ( Arrays.equals(altString, newAltString) ) { - cigar = newCigar; - i = -1; - if ( reachedEndOfRead ) - cigar = cleanUpCigar(cigar); - } - - if ( reachedEndOfRead ) - break; - } - - return cigar; - } - - private static boolean cigarHasZeroSizeElement(Cigar c) { - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getLength() == 0 ) - return true; - } - return false; - } - - private static Cigar cleanUpCigar(Cigar c) { - ArrayList elements = new ArrayList(c.numCigarElements()-1); - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getLength() != 0 && - (elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) { - elements.add(ce); - } - } - return new Cigar(elements); - } - - private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) { - // get the first few elements - ArrayList elements = new ArrayList(cigar.numCigarElements()); - for ( int i = 0; i < indexOfIndel - 1; i++) - elements.add(cigar.getCigarElement(i)); - - // get the indel element and move it left one base - CigarElement ce = cigar.getCigarElement(indexOfIndel-1); - elements.add(new CigarElement(ce.getLength()-1, ce.getOperator())); - elements.add(cigar.getCigarElement(indexOfIndel)); - if ( indexOfIndel+1 < cigar.numCigarElements() ) { - ce = cigar.getCigarElement(indexOfIndel+1); - elements.add(new CigarElement(ce.getLength()+1, ce.getOperator())); - } else { - elements.add(new CigarElement(1, CigarOperator.M)); - } - - // get the last few elements - for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++) - elements.add(cigar.getCigarElement(i)); - return new 
Cigar(elements); - } - - private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { - CigarElement indel = cigar.getCigarElement(indexOfIndel); - int indelLength = indel.getLength(); - - int totalRefBases = 0; - for ( int i = 0; i < indexOfIndel; i++ ) { - CigarElement ce = cigar.getCigarElement(i); - int length = ce.getLength(); - - switch( ce.getOperator() ) { - case M: - readIndex += length; - refIndex += length; - totalRefBases += length; - break; - case S: - readIndex += length; - break; - case N: - refIndex += length; - totalRefBases += length; - break; - default: - break; - } - } - - // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them - if ( totalRefBases + indelLength > refSeq.length ) - indelLength -= (totalRefBases + indelLength - refSeq.length); - - // the indel-based reference string - byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? 
-1 : 1))]; - - // add the bases before the indel, making sure it's not aligned off the end of the reference - if ( refIndex > alt.length || refIndex > refSeq.length ) - return null; - System.arraycopy(refSeq, 0, alt, 0, refIndex); - int currentPos = refIndex; - - // take care of the indel - if ( indel.getOperator() == CigarOperator.D ) { - refIndex += indelLength; - } else { - System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength); - currentPos += indelLength; - } - - // add the bases after the indel, making sure it's not aligned off the end of the reference - if ( refSeq.length - refIndex > alt.length - currentPos ) - return null; - System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex); - - return alt; - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; + + +public class AlignmentUtils { + + public static class MismatchCount { + public int numMismatches = 0; + public long mismatchQualities = 0; + } + + public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r, refSeq, refIndex).mismatchQualities; + } + + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r, refSeq, refIndex, 0, r.getReadLength()); + } + + // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single + // todo -- high performance implementation. 
We can do a lot better than this right now + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { + MismatchCount mc = new MismatchCount(); + + int readIdx = 0; + int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count + byte[] readSeq = r.getReadBases(); + Cigar c = r.getCigar(); + for (int i = 0; i < c.numCigarElements(); i++) { + + if (readIdx > endOnRead) break; + + CigarElement ce = c.getCigarElement(i); + switch (ce.getOperator()) { + case M: + for (int j = 0; j < ce.getLength(); j++, refIndex++, readIdx++) { + if (refIndex >= refSeq.length) + continue; + if (readIdx < startOnRead) continue; + if (readIdx > endOnRead) break; + byte refChr = refSeq[refIndex]; + byte readChr = readSeq[readIdx]; + // Note: we need to count X/N's as mismatches because that's what SAM requires + //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + // continue; // do not count Ns/Xs/etc ? + if (readChr != refChr) { + mc.numMismatches++; + mc.mismatchQualities += r.getBaseQualities()[readIdx]; + } + } + break; + case I: + case S: + readIdx += ce.getLength(); + break; + case D: + case N: + refIndex += ce.getLength(); + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + + } + return mc; + } + + /** + * Returns the number of mismatches in the pileup within the given reference context. + * + * @param pileup the pileup with reads + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. 
the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { + int mismatches = 0; + for (PileupElement p : pileup) + mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); + return mismatches; + } + + /** + * Returns the number of mismatches in the pileup element within the given reference context. + * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { + return mismatchesInRefWindow(p, ref, ignoreTargetSite, false); + } + + /** + * Returns the number of mismatches in the pileup element within the given reference context. + * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. 
the center of the window) + * @param qualitySumInsteadOfMismatchCount + * if true, return the quality score sum of the mismatches rather than the count + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) { + int sum = 0; + + int windowStart = ref.getWindow().getStart(); + int windowStop = ref.getWindow().getStop(); + byte[] refBases = ref.getBases(); + byte[] readBases = p.getRead().getReadBases(); + byte[] readQualities = p.getRead().getBaseQualities(); + Cigar c = p.getRead().getCigar(); + + int readIndex = 0; + int currentPos = p.getRead().getAlignmentStart(); + int refIndex = Math.max(0, currentPos - windowStart); + + for (int i = 0; i < c.numCigarElements(); i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { + // are we past the ref window? + if (currentPos > windowStop) + break; + + // are we before the ref window? + if (currentPos < windowStart) + continue; + + byte refChr = refBases[refIndex++]; + + // do we need to skip the target site? + if (ignoreTargetSite && ref.getLocus().getStart() == currentPos) + continue; + + byte readChr = readBases[readIndex]; + if (readChr != refChr) + sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1; + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + currentPos += cigarElementLength; + if (currentPos > windowStart) + refIndex += Math.min(cigarElementLength, currentPos - windowStart); + break; + case H: + case P: + break; + } + } + + return sum; + } + + /** + * Returns the number of mismatches in the pileup element within the given reference context. 
+ * + * @param read the SAMRecord + * @param ref the reference context + * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good + * @param windowSize window size (on each side) to test + * @return a bitset representing which bases are good + */ + public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { + // first determine the positions with mismatches + int readLength = read.getReadLength(); + BitSet mismatches = new BitSet(readLength); + + // it's possible we aren't starting at the beginning of a read, + // and we don't need to look at any of the previous context outside our window + // (although we do need future context) + int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize); + int currentReadPos = read.getAlignmentStart(); + + byte[] refBases = ref.getBases(); + int refIndex = readStartPos - ref.getWindow().getStart(); + if (refIndex < 0) { + throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); + } + + byte[] readBases = read.getReadBases(); + int readIndex = 0; + + Cigar c = read.getCigar(); + + for (int i = 0; i < c.numCigarElements(); i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++) { + // skip over unwanted bases + if (currentReadPos++ < readStartPos) + continue; + + // this is possible if reads extend beyond the contig end + if (refIndex >= refBases.length) + break; + + byte refChr = refBases[refIndex]; + byte readChr = readBases[readIndex]; + if (readChr != refChr) + mismatches.set(readIndex); + + refIndex++; + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + if (currentReadPos >= 
readStartPos) + refIndex += cigarElementLength; + currentReadPos += cigarElementLength; + break; + case H: + case P: + break; + } + } + + // all bits are set to false by default + BitSet result = new BitSet(readLength); + + int currentPos = 0, leftPos = 0, rightPos; + int mismatchCount = 0; + + // calculate how many mismatches exist in the windows to the left/right + for (rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { + if (mismatches.get(rightPos)) + mismatchCount++; + } + if (mismatchCount <= maxMismatches) + result.set(currentPos); + + // now, traverse over the read positions + while (currentPos < readLength) { + // add a new rightmost position + if (rightPos < readLength && mismatches.get(rightPos++)) + mismatchCount++; + // re-penalize the previous position + if (mismatches.get(currentPos++)) + mismatchCount++; + // don't penalize the current position + if (mismatches.get(currentPos)) + mismatchCount--; + // subtract the leftmost position + if (leftPos < currentPos - windowSize && mismatches.get(leftPos++)) + mismatchCount--; + + if (mismatchCount <= maxMismatches) + result.set(currentPos); + } + + return result; + } + + /** + * Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. + * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but + * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is + * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. + * Formally, this method simply returns the number of M elements in the cigar. + * + * @param r alignment + * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). 
+ */ + public static int getNumAlignmentBlocks(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) { + if (e.getOperator() == CigarOperator.M) n++; + } + + return n; + } + + public static int getNumAlignedBases(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) + if (e.getOperator() == CigarOperator.M) + n += e.getLength(); + + return n; + } + + public static byte[] alignmentToByteArray(final Cigar cigar, final byte[] read, final byte[] ref) { + + final byte[] alignment = new byte[read.length]; + int refPos = 0; + int alignPos = 0; + + for (int iii = 0; iii < cigar.numCigarElements(); iii++) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + case S: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos++] = '+'; + } + break; + case D: + case N: + refPos += elementLength; + break; + case M: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos] = ref[refPos]; + alignPos++; + refPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return alignment; + } + + public static int calcAlignmentByteArrayOffset(final Cigar cigar, PileupElement pileup, final int alignmentStart, final int refLocus) { + int pileupOffset = pileup.getOffset(); + + // Special case for reads starting with insertion + if (pileup.isInsertionAtBeginningOfRead()) + return 0; + + // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases + if (pileup.isDeletion()) { + pileupOffset = refLocus - alignmentStart; + final CigarElement ce = cigar.getCigarElement(0); + if (ce.getOperator() == CigarOperator.S) { + 
pileupOffset += ce.getLength(); + } + } + + int pos = 0; + int alignmentPos = 0; + + for (int iii = 0; iii < cigar.numCigarElements(); iii++) { + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + case S: + pos += elementLength; + if (pos >= pileupOffset) { + return alignmentPos; + } + break; + case D: + case N: + if (!pileup.isDeletion()) { + alignmentPos += elementLength; + } else { + if (pos + elementLength - 1 >= pileupOffset) { + return alignmentPos + (pileupOffset - pos); + } else { + pos += elementLength; + alignmentPos += elementLength; + } + } + break; + case M: + if (pos + elementLength - 1 >= pileupOffset) { + return alignmentPos + (pileupOffset - pos); + } else { + pos += elementLength; + alignmentPos += elementLength; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + + return alignmentPos; + } + + public static byte[] readToAlignmentByteArray(final Cigar cigar, final byte[] read) { + + int alignmentLength = 0; + for (int iii = 0; iii < cigar.numCigarElements(); iii++) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + case S: + break; + case D: + case N: + alignmentLength += elementLength; + break; + case M: + alignmentLength += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + + final byte[] alignment = new byte[alignmentLength]; + int alignPos = 0; + int readPos = 0; + for (int iii = 0; iii < cigar.numCigarElements(); iii++) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch (ce.getOperator()) { + case I: + if (alignPos > 0) { + if (alignment[alignPos - 1] == BaseUtils.A) { + alignment[alignPos - 1] = 
PileupElement.A_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[alignPos - 1] == BaseUtils.C) { + alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[alignPos - 1] == BaseUtils.T) { + alignment[alignPos - 1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[alignPos - 1] == BaseUtils.G) { + alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; + } + } + case S: + for (int jjj = 0; jjj < elementLength; jjj++) { + readPos++; + } + break; + case D: + case N: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos] = PileupElement.DELETION_BASE; + alignPos++; + } + break; + case M: + for (int jjj = 0; jjj < elementLength; jjj++) { + alignment[alignPos] = read[readPos]; + alignPos++; + readPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return alignment; + } + + /** + * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and + * alignment reference index/start. + * + * @param r record + * @return true if read is unmapped + */ + public static boolean isReadUnmapped(final SAMRecord r) { + if (r.getReadUnmappedFlag()) return true; + + // our life would be so much easier if all sam files followed the specs. In reality, + // sam files (including those generated by maq or bwa) miss headers altogether. When + // reading such a SAM file, reference name is set, but since there is no sequence dictionary, + // null is always returned for referenceIndex. Let's be paranoid here, and make sure that + // we do not call the read "unmapped" when it has only reference name set with ref. index missing + // or vice versa. 
+ if ((r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) + && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) return false; + return true; + } + + /** + * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and + * alignment reference index/start of the mate. + * + * @param r sam record for the read + * @return true if read's mate is unmapped + */ + public static boolean isMateUnmapped(final SAMRecord r) { + if (r.getMateUnmappedFlag()) return true; + + // our life would be so much easier if all sam files followed the specs. In reality, + // sam files (including those generated by maq or bwa) miss headers altogether. When + // reading such a SAM file, reference name is set, but since there is no sequence dictionary, + // null is always returned for referenceIndex. Let's be paranoid here, and make sure that + // we do not call the read "unmapped" when it has only reference name set with ref. index missing + // or vice versa. + if ((r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) + && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) return false; + return true; + } + + /** + * Returns true is read is mapped and mapped uniquely (Q>0). + * + * @param read + * @return + */ + public static boolean isReadUniquelyMapped(SAMRecord read) { + return (!AlignmentUtils.isReadUnmapped(read)) && read.getMappingQuality() > 0; + } + + /** + * Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from + * cycle 1). 
In other words, if the read is unmapped or aligned in the forward direction, the read's own base + * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array + * of read's base qualitites is inverted (in this case new array is allocated and returned). + * + * @param read + * @return + */ + public static byte[] getQualsInCycleOrder(SAMRecord read) { + if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getBaseQualities(); + + return Utils.reverse(read.getBaseQualities()); + } + + /** + * Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from + * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base + * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array + * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities + * are available this method will throw a runtime exception. + * + * @param read + * @return + */ + public static byte[] getOriginalQualsInCycleOrder(SAMRecord read) { + if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getOriginalBaseQualities(); + + return Utils.reverse(read.getOriginalBaseQualities()); + } + + /** + * Takes the alignment of the read sequence readSeq to the reference sequence refSeq + * starting at 0-based position refIndex on the refSeq and specified by its cigar. + * The last argument readIndex specifies 0-based position on the read where the alignment described by the + * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. + * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. 
The refIndex in this case + * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are + * always the positions where the cigar starts on the ref and on the read, respectively. + *
+ * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar + * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT + * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence + * is not repeated), the original cigar is returned. + * + * @param cigar structure of the original alignment + * @param refSeq reference sequence the read is aligned to + * @param readSeq read sequence + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) + */ + public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { + + int indexOfIndel = -1; + for (int i = 0; i < cigar.numCigarElements(); i++) { + CigarElement ce = cigar.getCigarElement(i); + if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { + // if there is more than 1 indel, don't left align + if (indexOfIndel != -1) + return cigar; + indexOfIndel = i; + } + } + + // if there is no indel or if the alignment starts with an insertion (so that there + // is no place on the read to move that insertion further left), we are done + if (indexOfIndel < 1) return cigar; + + final int indelLength = cigar.getCigarElement(indexOfIndel).getLength(); + + byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); + if (altString == null) + return cigar; + + Cigar newCigar = cigar; + for (int i = 0; i < indelLength; i++) { + newCigar = moveCigarLeft(newCigar, indexOfIndel); + byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, 
readIndex); + + // check to make sure we haven't run off the end of the read + boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar); + + if (Arrays.equals(altString, newAltString)) { + cigar = newCigar; + i = -1; + if (reachedEndOfRead) + cigar = cleanUpCigar(cigar); + } + + if (reachedEndOfRead) + break; + } + + return cigar; + } + + private static boolean cigarHasZeroSizeElement(Cigar c) { + for (CigarElement ce : c.getCigarElements()) { + if (ce.getLength() == 0) + return true; + } + return false; + } + + private static Cigar cleanUpCigar(Cigar c) { + ArrayList elements = new ArrayList(c.numCigarElements() - 1); + for (CigarElement ce : c.getCigarElements()) { + if (ce.getLength() != 0 && + (elements.size() != 0 || ce.getOperator() != CigarOperator.D)) { + elements.add(ce); + } + } + return new Cigar(elements); + } + + private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) { + // get the first few elements + ArrayList elements = new ArrayList(cigar.numCigarElements()); + for (int i = 0; i < indexOfIndel - 1; i++) + elements.add(cigar.getCigarElement(i)); + + // get the indel element and move it left one base + CigarElement ce = cigar.getCigarElement(indexOfIndel - 1); + elements.add(new CigarElement(ce.getLength() - 1, ce.getOperator())); + elements.add(cigar.getCigarElement(indexOfIndel)); + if (indexOfIndel + 1 < cigar.numCigarElements()) { + ce = cigar.getCigarElement(indexOfIndel + 1); + elements.add(new CigarElement(ce.getLength() + 1, ce.getOperator())); + } else { + elements.add(new CigarElement(1, CigarOperator.M)); + } + + // get the last few elements + for (int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++) + elements.add(cigar.getCigarElement(i)); + return new Cigar(elements); + } + + private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + CigarElement indel = cigar.getCigarElement(indexOfIndel); + int indelLength = 
indel.getLength(); + + int totalRefBases = 0; + for (int i = 0; i < indexOfIndel; i++) { + CigarElement ce = cigar.getCigarElement(i); + int length = ce.getLength(); + + switch (ce.getOperator()) { + case M: + readIndex += length; + refIndex += length; + totalRefBases += length; + break; + case S: + readIndex += length; + break; + case N: + refIndex += length; + totalRefBases += length; + break; + default: + break; + } + } + + // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them + if (totalRefBases + indelLength > refSeq.length) + indelLength -= (totalRefBases + indelLength - refSeq.length); + + // the indel-based reference string + byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))]; + + // add the bases before the indel, making sure it's not aligned off the end of the reference + if (refIndex > alt.length || refIndex > refSeq.length) + return null; + System.arraycopy(refSeq, 0, alt, 0, refIndex); + int currentPos = refIndex; + + // take care of the indel + if (indel.getOperator() == CigarOperator.D) { + refIndex += indelLength; + } else { + System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength); + currentPos += indelLength; + } + + // add the bases after the indel, making sure it's not aligned off the end of the reference + if (refSeq.length - refIndex > alt.length - currentPos) + return null; + System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex); + + return alt; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 542adea77..8661d5ad0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -27,7 +27,7 @@ public class ArtificialSAMUtils { * @param chromosomeSize how large each 
chromosome is * @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads) */ - public static void createArtificialBamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) { + public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); File outFile = new File(filename); @@ -51,7 +51,7 @@ public class ArtificialSAMUtils { * @param chromosomeSize how large each chromosome is * @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads) */ - public static void createArtificialSamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) { + public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); File outFile = new File(filename); @@ -72,16 +72,15 @@ public class ArtificialSAMUtils { * @param numberOfChromosomes the number of chromosomes to create * @param startingChromosome the starting number for the chromosome (most likely set to 1) * @param chromosomeSize the length of each chromosome - * * @return */ - public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) { + public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { SAMFileHeader header = new SAMFileHeader(); header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); SAMSequenceDictionary dict 
= new SAMSequenceDictionary(); // make up some sequence records for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - SAMSequenceRecord rec = new SAMSequenceRecord("chr" + ( x ), chromosomeSize /* size */); + SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); rec.setSequenceLength(chromosomeSize); dict.addSequence(rec); } @@ -95,10 +94,9 @@ public class ArtificialSAMUtils { * @param header the header to set * @param readGroupID the read group ID tag * @param sampleName the sample name - * * @return the adjusted SAMFileHeader */ - public static SAMFileHeader createDefaultReadGroup( SAMFileHeader header, String readGroupID, String sampleName ) { + public static SAMFileHeader createDefaultReadGroup(SAMFileHeader header, String readGroupID, String sampleName) { SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID); rec.setSample(sampleName); List readGroups = new ArrayList(); @@ -113,10 +111,9 @@ public class ArtificialSAMUtils { * @param header the header to set * @param readGroupIDs the read group ID tags * @param sampleNames the sample names - * * @return the adjusted SAMFileHeader */ - public static SAMFileHeader createEnumeratedReadGroups( SAMFileHeader header, List readGroupIDs, List sampleNames ) { + public static SAMFileHeader createEnumeratedReadGroups(SAMFileHeader header, List readGroupIDs, List sampleNames) { if (readGroupIDs.size() != sampleNames.size()) { throw new ReviewedStingException("read group count and sample name count must be the same"); } @@ -137,18 +134,16 @@ public class ArtificialSAMUtils { /** * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read * - * * @param header the SAM header to associate the read with * @param name the name of the read * @param refIndex the reference index, i.e. 
what chromosome to associate it with * @param alignmentStart where to start the alignment * @param length the length of the read - * * @return the artificial read */ public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { - if( (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || - (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) ) + if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || + (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START)) throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart); GATKSAMRecord record = new GATKSAMRecord(header); record.setReadName(name); @@ -183,10 +178,9 @@ public class ArtificialSAMUtils { * @param alignmentStart where to start the alignment * @param bases the sequence of the read * @param qual the qualities of the read - * * @return the artificial read */ - public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual ) { + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { if (bases.length != qual.length) { throw new ReviewedStingException("Passed in read string is different length then the quality array"); } @@ -210,10 +204,9 @@ public class ArtificialSAMUtils { * @param bases the sequence of the read * @param qual the qualities of the read * @param cigar the cigar string of the read - * * @return the artificial read */ - public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar ) { + public static GATKSAMRecord 
createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); rec.setCigarString(cigar); return rec; @@ -221,22 +214,21 @@ public class ArtificialSAMUtils { /** * Create an artificial read with the following default parameters : - * header: - * numberOfChromosomes = 1 - * startingChromosome = 1 - * chromosomeSize = 1000000 - * read: - * name = "default_read" - * refIndex = 0 - * alignmentStart = 1 - * - * @param bases the sequence of the read - * @param qual the qualities of the read - * @param cigar the cigar string of the read + * header: + * numberOfChromosomes = 1 + * startingChromosome = 1 + * chromosomeSize = 1000000 + * read: + * name = "default_read" + * refIndex = 0 + * alignmentStart = 1 * + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read * @return the artificial read */ - public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { + public static GATKSAMRecord createArtificialRead(byte[] bases, byte[] qual, String cigar) { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); } @@ -253,7 +245,7 @@ public class ArtificialSAMUtils { right.setProperPairFlag(true); left.setFirstOfPairFlag(leftIsFirst); - right.setFirstOfPairFlag(! 
leftIsFirst); + right.setFirstOfPairFlag(!leftIsFirst); left.setReadNegativeStrandFlag(leftIsNegative); left.setMateNegativeStrandFlag(!leftIsNegative); @@ -279,11 +271,10 @@ public class ArtificialSAMUtils { * @param startingChr the chromosome (reference ID) to start from * @param endingChr the id to end with * @param readCount the number of reads per chromosome - * * @return StingSAMIterator representing the specified amount of fake data */ - public static StingSAMIterator mappedReadIterator( int startingChr, int endingChr, int readCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static StingSAMIterator mappedReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); } @@ -295,11 +286,10 @@ public class ArtificialSAMUtils { * @param endingChr the id to end with * @param readCount the number of reads per chromosome * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file - * * @return StingSAMIterator representing the specified amount of fake data */ - public static StingSAMIterator mappedAndUnmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static StingSAMIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } @@ -310,11 +300,10 
@@ public class ArtificialSAMUtils { * @param startingChr the chromosome (reference ID) to start from * @param endingChr the id to end with * @param readCount the number of reads per chromosome - * * @return StingSAMIterator representing the specified amount of fake data */ - public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static ArtificialSAMQueryIterator queryReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); } @@ -326,11 +315,10 @@ public class ArtificialSAMUtils { * @param endingChr the id to end with * @param readCount the number of reads per chromosome * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file - * * @return StingSAMIterator representing the specified amount of fake data */ - public static StingSAMIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) { - SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + public static StingSAMIterator queryReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } @@ -345,6 +333,7 @@ public class ArtificialSAMUtils { * reads created that have readLen bases. 
Pairs are sampled from a gaussian distribution with mean insert * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second * may be, depending on where this sampled insertSize puts it. + * * @param header * @param loc * @param readLen @@ -360,22 +349,22 @@ public class ArtificialSAMUtils { final int pos = loc.getStart(); final List pileupElements = new ArrayList(); - for ( int i = 0; i < pileupSize / 2; i++ ) { + for (int i = 0; i < pileupSize / 2; i++) { final String readName = "read" + i; final int leftStart = ranIntInclusive(ran, 1, pos); - final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize); + final int fragmentSize = (int) (ran.nextGaussian() * insertSizeVariation + insertSize); final int rightStart = leftStart + fragmentSize - readLen; - if ( rightStart <= 0 ) continue; + if (rightStart <= 0) continue; List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); final GATKSAMRecord left = pair.get(0); final GATKSAMRecord right = pair.get(1); - pileupElements.add(new PileupElement(left, pos - leftStart)); + pileupElements.add(new PileupElement(left, pos - leftStart, false)); - if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) { - pileupElements.add(new PileupElement(right, pos - rightStart)); + if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { + pileupElements.add(new PileupElement(right, pos - rightStart, false)); } } diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 61829dcfc..626b91cbf 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -1,13 +1,20 @@ package org.broadinstitute.sting; -import org.apache.log4j.*; +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import 
org.apache.log4j.PatternLayout; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.io.IOUtils; -import java.io.*; -import java.util.*; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 5cdf12f1b..e9b4fc211 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("d61c7055bd09024abb8902bde6bd3960")); + Arrays.asList("653172b43b19003d9f7df6dab21f4b09")); executeTest("test MultiSample Pilot1", spec); } @@ -227,7 +227,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b11df6587e4e16cb819d76a900446946")); + Arrays.asList("bd9d3d50a1f49605d7cd592a0f446899")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -255,7 +255,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("59068bc8888ad5f08790946066d76602")); + Arrays.asList("91cd6d2e3972b0b8e4064bb35a33241f")); executeTest(String.format("test indel calling, multiple 
technologies"), spec); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("fcd590a55f5fec2a9b7e628187d6b8a8")); + Arrays.asList("877de5b0cc61dc54636062df6399b978")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 367f6294d..1a8086a1b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -42,12 +42,12 @@ public class ReadUtilsUnitTest extends BaseTest { @Test public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0); - PileupElement reducedreadp = new PileupElement(reducedRead, 0); + PileupElement readp = new PileupElement(read, 0, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false); - Assert.assertFalse(readp.isReducedRead()); + Assert.assertFalse(readp.getRead().isReducedRead()); - Assert.assertTrue(reducedreadp.isReducedRead()); + Assert.assertTrue(reducedreadp.getRead().isReducedRead()); Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); } From 97499529c73b9cb8b262bbada2945c14004992fd Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 24 Jan 2012 16:13:53 -0500 Subject: [PATCH 22/22] another small bug with the file extension. 
--- .../sting/queue/qscripts/PacbioProcessingPipeline.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index d5f7512e4..c64eef7f7 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -62,12 +62,12 @@ class PacbioProcessingPipeline extends QScript { var USE_BWA: Boolean = false var resetQuals: Boolean = true - if (file.endsWith(".fasta") || file.endsWith(".fq")) { + if (file.endsWith(".fasta") || file.endsWith(".fq") || file.endsWith(".fastq")) { if (bwaPath == null) { throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); } USE_BWA = true - if (file.endsWith(".fq")) + if (file.endsWith(".fq") || file.endsWith(".fastq")) resetQuals = false }