diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R index 19567e7e6..45dacd835 100644 --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R @@ -207,7 +207,7 @@ plotVariantQC <- function(metrics, measures, requestedStrat = "Sample", if ( requestedStrat == "Sample" ) { perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale - perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "") + perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)") } else { # by AlleleCount perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs))) perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount") diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java index a5647ec0f..618120217 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.commandline; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.lang.annotation.Annotation; import java.util.List; @@ -147,6 +148,9 @@ public class ArgumentDefinition { this.exclusiveOf = exclusiveOf; this.validation = validation; this.validOptions = validOptions; + + validateName(shortName); + validateName(fullName); } /** @@ -192,6 +196,9 @@ public class ArgumentDefinition { else shortName = null; + validateName(shortName); + validateName(fullName); + this.ioType = ioType; this.argumentType = argumentType; this.fullName = fullName; @@ -277,4 +284,14 @@ public class ArgumentDefinition { String validation = (String)CommandLineUtils.getValue(annotation, "validation"); return validation.trim().length() > 0 ? validation.trim() : null; } + + /** + * Make sure the argument's name is valid + * + * @param name + */ + private void validateName(final String name) { + if ( name != null && name.startsWith("-") ) + throw new ReviewedStingException("Invalid argument definition: " + name + " begins with a -"); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index b131e36c1..c02ae7d99 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -62,23 +62,24 @@ public class ReferenceDataSource { * @param fastaFile Fasta file to be used as reference */ public ReferenceDataSource(File fastaFile) { - // does the fasta file exist? check that first... if (!fastaFile.exists()) throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist."); - File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); - File dictFile; - if (fastaFile.getAbsolutePath().endsWith("fa")) { - dictFile = new File(fastaFile.getAbsolutePath().replace(".fa", ".dict")); - } - else - dictFile = new File(fastaFile.getAbsolutePath().replace(".fasta", ".dict")); + final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz"); + + final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); + + // determine the name for the dict file + final String fastaExt = (fastaFile.getAbsolutePath().endsWith("fa") ? ".fa" : ".fasta" ) + (isGzipped ? ".gz" : ""); + final File dictFile = new File(fastaFile.getAbsolutePath().replace(fastaExt, ".dict")); /* - if index file does not exist, create it manually - */ + * if index file does not exist, create it manually + */ if (!indexFile.exists()) { + if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile); + logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath())); FSLockWithShared indexLock = new FSLockWithShared(indexFile,true); try { @@ -95,7 +96,7 @@ public class ReferenceDataSource { } catch(UserException e) { // Rethrow all user exceptions as-is; there should be more details in the UserException itself. - throw e; + throw e; } catch (Exception e) { // If lock creation succeeded, the failure must have been generating the index. @@ -114,6 +115,8 @@ public class ReferenceDataSource { * This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary */ if (!dictFile.exists()) { + if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile); + logger.info(String.format("Dict file %s does not exist. Trying to create it now.", dictFile.getAbsolutePath())); /* @@ -218,9 +221,9 @@ public class ReferenceDataSource { for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); shards.add(new LocusShard(parser, - readsDataSource, - Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), - null)); + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); } } return shards; diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java index 9a1455859..cda7392ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java @@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import java.util.Iterator; + /** * Filter out reads with wonky cigar strings. * + * - No reads with Hard/Soft clips in the middle of the cigar + * - No reads starting with deletions (with or without preceding clips) + * - No reads ending in deletions (with or without follow-up clips) + * - No reads that are fully hard or soft clipped + * - No reads that have consecutive indels in the cigar (II, DD, ID or DI) + * + * ps: apparently an empty cigar is okay... + * * @author ebanks * @version 0.1 */ @@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter { public boolean filterOut(final SAMRecord rec) { final Cigar c = rec.getCigar(); - if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad - boolean previousElementWasIndel = false; - CigarOperator lastOp = c.getCigarElement(0).getOperator(); - - if (lastOp == CigarOperator.D) // filter out reads starting with deletion - return true; - - for (CigarElement ce : c.getCigarElements()) { - CigarOperator op = ce.getOperator(); - if (op == CigarOperator.D || op == CigarOperator.I) { - if (previousElementWasIndel) - return true; // filter out reads with adjacent I/D - - previousElementWasIndel = true; - } - else // this is a regular base (match/mismatch/hard or soft clip) - previousElementWasIndel = false; // reset the previous element - - lastOp = op; + // if there is no Cigar then it can't be bad + if( c.isEmpty() ) { + return false; } - return lastOp == CigarOperator.D; + Iterator elementIterator = c.getCigarElements().iterator(); + + CigarOperator firstOp = CigarOperator.H; + while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) { + CigarOperator op = elementIterator.next().getOperator(); + + // No reads with Hard/Soft clips in the middle of the cigar + if (firstOp != CigarOperator.H && op == CigarOperator.H) { + return true; + } + firstOp = op; + } + + // No reads starting with deletions (with or without preceding clips) + if (firstOp == CigarOperator.D) { + return true; + } + + boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S); + boolean previousElementWasIndel = firstOp == CigarOperator.I; + CigarOperator lastOp = firstOp; + CigarOperator previousOp = firstOp; + + while (elementIterator.hasNext()) { + CigarOperator op = elementIterator.next().getOperator(); + + if (op != CigarOperator.S && op != CigarOperator.H) { + + // No reads with Hard/Soft clips in the middle of the cigar + if (previousOp == CigarOperator.S || previousOp == CigarOperator.H) + return true; + + lastOp = op; + + if (!hasMeaningfulElements && op.consumesReadBases()) { + hasMeaningfulElements = true; + } + + if (op == CigarOperator.I || op == CigarOperator.D) { + + // No reads that have consecutive indels in the cigar (II, DD, ID or DI) + if (previousElementWasIndel) { + return true; + } + previousElementWasIndel = true; + } + else { + previousElementWasIndel = false; + } + } + // No reads with Hard/Soft clips in the middle of the cigar + else if (op == CigarOperator.S && previousOp == CigarOperator.H) { + return true; + } + + previousOp = op; + } + + // No reads ending in deletions (with or without follow-up clips) + // No reads that are fully hard or soft clipped + return lastOp == CigarOperator.D || !hasMeaningfulElements; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 1606c227d..f97069189 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -159,7 +159,7 @@ public class LocusIteratorByState extends LocusIterator { return stepForwardOnGenome(); } else { if (curElement != null && curElement.getOperator() == CigarOperator.D) - throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); // Reads that contain indels model the genomeOffset as the following base in the reference. Because // we fall into this else block only when indels end the read, increment genomeOffset such that the @@ -185,7 +185,7 @@ public class LocusIteratorByState extends LocusIterator { break; case D: // deletion w.r.t. the reference if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string - throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); + throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); // should be the same as N case genomeOffset++; done = true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index cb094d29b..d29e9a5f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -75,8 +75,6 @@ public class TraverseReads extends TraversalEngine,Read if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - boolean needsReferenceBasesP = WalkerManager.isRequired(walker, DataSource.REFERENCE_BASES); - ReadView reads = new ReadView(dataProvider); ReadReferenceView reference = new ReadReferenceView(dataProvider); @@ -91,7 +89,7 @@ public class TraverseReads extends TraversalEngine,Read ReferenceContext refContext = null; // get the array of characters for the reference sequence, since we're a mapped read - if (needsReferenceBasesP && !read.getReadUnmappedFlag() && dataProvider.hasReference()) + if (!read.getReadUnmappedFlag() && dataProvider.hasReference()) refContext = reference.getReferenceContext(read); // update the number of reads we've seen diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index aba508b3e..cbe791353 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -28,7 +28,7 @@ import java.util.List; */ @By(DataSource.READS) -@Requires({DataSource.READS, DataSource.REFERENCE_BASES}) +@Requires({DataSource.READS, DataSource.REFERENCE}) @PartitionBy(PartitionType.READ) @ActiveRegionExtension(extension=50,maxRegion=1500) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java index a152ab137..1f93c67a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java @@ -16,8 +16,18 @@ package org.broadinstitute.sting.gatk.walkers; * Allow user to choose between a number of different data sources. */ public enum DataSource { + /** + * Does this walker require read (BAM) data to work? + */ READS, + + /** + * Does this walker require reference data to work? + */ REFERENCE, - REFERENCE_BASES, // Do I actually need the reference bases passed to the walker? + + /** + * Does this walker require reference order data (VCF) to work? + */ REFERENCE_ORDERED_DATA } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index 2a92d8831..3b18dda44 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -16,7 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; * To change this template use File | Settings | File Templates. */ @By(DataSource.READS) -@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +@Requires({DataSource.READS,DataSource.REFERENCE}) @PartitionBy(PartitionType.LOCUS) @ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) @RemoveProgramRecords diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index 8933bd73e..77e3af93f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -12,7 +12,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * Time: 2:52:28 PM * To change this template use File | Settings | File Templates. */ -@Requires({DataSource.READS, DataSource.REFERENCE_BASES}) +@Requires({DataSource.READS, DataSource.REFERENCE}) @PartitionBy(PartitionType.READ) public abstract class ReadWalker extends Walker { public boolean requiresOrderedReads() { return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java index 1d3debb48..45bd14d4e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java @@ -8,7 +8,7 @@ package org.broadinstitute.sting.gatk.walkers; * To change this template use File | Settings | File Templates. */ @By(DataSource.REFERENCE) -@Requires({DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +@Requires({DataSource.REFERENCE}) @Allows(DataSource.REFERENCE) public abstract class RefWalker extends LocusWalker { } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 3f35cf8e8..e45cad971 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -107,7 +107,7 @@ import java.util.ArrayList; @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @By(DataSource.READS) @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file -@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) // filter out all reads with zero or unavailable mapping quality +@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta public class BaseRecalibrator extends LocusWalker implements TreeReducible { @ArgumentCollection diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 6eb9774ea..cda0a14c9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -474,10 +474,17 @@ public class UnifiedGenotyperEngine { // add the MLE AC and AF annotations if ( alleleCountsofMLE.size() > 0 ) { attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); - final double AN = (double)builder.make().getCalledChrCount(); + final int AN = builder.make().getCalledChrCount(); + + // let's sanity check that we don't have an invalid MLE value in there + for ( int MLEAC : alleleCountsofMLE ) { + if ( MLEAC > AN ) + throw new ReviewedStingException(String.format("MLEAC value (%d) is larger than AN (%d) at position %s:%d", MLEAC, AN, loc.getContig(), loc.getStart())); + } + final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); for ( int AC : alleleCountsofMLE ) - MLEfrequencies.add((double)AC / AN); + MLEfrequencies.add((double)AC / (double)AN); attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 0dcafb30a..bbd4bf92f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -645,7 +645,7 @@ public class PhaseByTransmission extends RodWalker, HashMa bestChildGenotype.clear(); bestChildGenotype.add(childGenotype.getKey()); } - else if(MathUtils.compareDoubles(configurationLikelihood, bestConfigurationLikelihood) == 0) { + else if(configurationLikelihood == bestConfigurationLikelihood) { bestFirstParentGenotype.add(firstParentGenotype.getKey()); bestSecondParentGenotype.add(secondParentGenotype.getKey()); bestChildGenotype.add(childGenotype.getKey()); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index fc0b3c4a9..60fcb6585 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -118,6 +118,7 @@ public final class BCF2Codec implements FeatureCodec { final int sitesBlockSize = decoder.readBlockSize(inputStream); final int genotypeBlockSize = decoder.readBlockSize(inputStream); + decoder.readNextBlock(sitesBlockSize, inputStream); decodeSiteLoc(builder); final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index d7f59632c..05ba2aa1f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -82,7 +82,7 @@ public final class BCF2Decoder { public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) { try { final int bytesRead = (int)stream.skip(blockSizeInBytes); - validateReadBytes(bytesRead, blockSizeInBytes); + validateReadBytes(bytesRead, 1, blockSizeInBytes); } catch ( IOException e ) { throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e); } @@ -316,17 +316,37 @@ public final class BCF2Decoder { } /** + * Read all bytes for a BCF record block into a byte[], and return it * - * @param inputStream - * @return + * Is smart about reading from the stream multiple times to fill the buffer, if necessary + * + * @param blockSizeInBytes number of bytes to read + * @param inputStream the stream to read from + * @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream */ - private final static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { + @Requires({"blockSizeInBytes >= 0", "inputStream != null"}) + @Ensures("result != null") + private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { assert blockSizeInBytes >= 0; final byte[] record = new byte[blockSizeInBytes]; try { - final int bytesRead = inputStream.read(record); - validateReadBytes(bytesRead, blockSizeInBytes); + int bytesRead = 0; + int nReadAttempts = 0; // keep track of how many times we've read + + // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF + while ( bytesRead < blockSizeInBytes ) { + final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead); + if ( read1 == -1 ) + validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); + else + bytesRead += read1; + } + + if ( nReadAttempts > 1 ) // TODO -- remove me + logger.warn("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior"); + + validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); } catch ( IOException e ) { throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e); } @@ -334,14 +354,20 @@ public final class BCF2Decoder { return record; } - private final static void validateReadBytes(final int actuallyRead, final int expected) { + /** + * Make sure we read the right number of bytes, or throw an error + * + * @param actuallyRead + * @param nReadAttempts + * @param expected + */ + private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) { assert expected >= 0; if ( actuallyRead < expected ) { - throw new UserException.MalformedBCF2(String.format("Failed to read next complete record: %s", - actuallyRead == -1 ? - "premature end of input stream" : - String.format("expected %d bytes but read only %d", expected, actuallyRead))); + throw new UserException.MalformedBCF2( + String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations", + expected, actuallyRead, nReadAttempts)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index bda03f675..3130469e5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -340,6 +340,17 @@ public class UserException extends ReviewedStingException { } } + public static class CouldNotCreateReferenceFAIorDictForGzippedRef extends UserException { + public CouldNotCreateReferenceFAIorDictForGzippedRef(final File f) { + super("Although the GATK can process .gz reference sequences, it currently cannot create FAI " + + "or DICT files for them. In order to use the GATK with reference.fasta.gz you will need to " + + "create .dict and .fai files for reference.fasta.gz and name them reference.fasta.gz.fai and " + + "reference.dict. Potentially the easiest way to do this is to uncompress reference.fasta, " + + "run the GATK to create the .dict and .fai files, and copy them to the appropriate location. " + + "Sorry for the inconvenience."); + } + } + public static class CouldNotCreateReferenceIndexFileBecauseOfLock extends UserException.CouldNotCreateReferenceIndexFile { public CouldNotCreateReferenceIndexFileBecauseOfLock(File f) { super(f, "could not be written because an exclusive file lock could not be obtained. " + diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 44b586bcd..48706543a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -41,6 +41,8 @@ import java.util.Arrays; * Thread-safe! Uses a lock object to protect write and access to the cache. */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { + protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); + /** global enable flag */ private static final boolean USE_CACHE = true; @@ -125,7 +127,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { public void printEfficiency() { // comment out to disable tracking if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) { - System.out.printf("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency()); + logger.info(String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency())); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java index 333d35641..ff918db68 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java @@ -1,11 +1,14 @@ package org.broadinstitute.sting.gatk.filters; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import net.sf.samtools.Cigar; +import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.List; + /** * Checks that the Bad Cigar filter works for all kinds of wonky cigars * @@ -14,6 +17,29 @@ import org.testng.annotations.Test; */ public class BadCigarFilterUnitTest { + public static final String[] BAD_CIGAR_LIST = { + "2D4M", // starting with multiple deletions + "4M2D", // ending with multiple deletions + "3M1I1D", // adjacent indels AND ends in deletion + "1M1I1D2M", // adjacent indels I->D + "1M1D2I1M", // adjacent indels D->I + "1M1I2M1D", // ends in single deletion with insertion in the middle + "4M1D", // ends in single deletion + "1D4M", // starts with single deletion + "2M1D1D2M", // adjacent D's + "1M1I1I1M", // adjacent I's + "1H1D4M", // starting with deletion after H + "1S1D3M", // starting with deletion after S + "1H1S1D3M", // starting with deletion after HS + "4M1D1H", // ending with deletion before H + "3M1D1S", // ending with deletion before S + "3M1D1S1H", // ending with deletion before HS + "10M2H10M", // H in the middle + "10M2S10M", // S in the middle + "1H1S10M2S10M1S1H", // deceiving S in the middle + "1H1S10M2H10M1S1H" // deceiving H in the middle + }; + BadCigarFilter filter; @BeforeClass @@ -21,40 +47,20 @@ public class BadCigarFilterUnitTest { filter = new BadCigarFilter(); } - @Test + @Test(enabled = true) public void testWonkyCigars () { - byte[] bases = {'A', 'A', 'A', 'A'}; - byte[] quals = {30, 30, 30, 30}; - GATKSAMRecord read; - // starting with multiple deletions - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M"); - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + for (String cigarString : BAD_CIGAR_LIST) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString); + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + } + } - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + @Test(enabled = true) + public void testGoodCigars() { + List cigarList = ReadClipperTestUtils.generateCigarList(10); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + Assert.assertFalse(filter.filterOut(read), read.getCigarString()); + } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index baa2f6218..208c14fbd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -4,6 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -37,17 +38,22 @@ public class ReadClipperTestUtils { return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); } - /** - * This function generates every valid permutation of cigar strings with a given length. - * - * A valid cigar object obeys the following rules: - * - No Hard/Soft clips in the middle of the read - * - No deletions in the beginning / end of the read - * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) - * - * @param maximumLength the maximum number of elements in the cigar - * @return a list with all valid Cigar objects - */ + public static GATKSAMRecord makeReadFromCigar(String cigarString) { + return makeReadFromCigar(cigarFromString(cigarString)); + } + + /** + * This function generates every valid permutation of cigar strings with a given length. + * + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * - No consecutive I/D elements + * + * @param maximumLength the maximum number of elements in the cigar + * @return a list with all valid Cigar objects + */ public static List generateCigarList(int maximumLength) { int numCigarElements = cigarElements.length; LinkedList cigarList = new LinkedList(); @@ -137,7 +143,10 @@ public class ReadClipperTestUtils { CigarElement lastElement = null; int lastElementLength = 0; for (CigarElement cigarElement : rawCigar.getCigarElements()) { - if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator()) + if (lastElement != null && + ((lastElement.getOperator() == cigarElement.getOperator()) || + (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) || + (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I))) lastElementLength += cigarElement.getLength(); else { @@ -191,7 +200,7 @@ public class ReadClipperTestUtils { /** * Checks whether or not the read has any cigar element that is not H or S * - * @param read + * @param read the read * @return true if it has any M, I or D, false otherwise */ public static boolean readHasNonClippedBases(GATKSAMRecord read) { @@ -201,5 +210,79 @@ public class ReadClipperTestUtils { return false; } + public static Cigar cigarFromString(String cigarString) { + Cigar cigar = new Cigar(); + + boolean isNumber = false; + int number = 0; + for (int i = 0; i < cigarString.length(); i++) { + char x = cigarString.charAt(i); + + if (x >= '0' && x <='9') { + if (isNumber) { + number *= 10; + } + else { + isNumber = true; + } + number += x - '0'; + } + + else { + CigarElement e; + switch (x) { + case 'M': + case 'm': + e = new CigarElement(number, CigarOperator.M); + break; + + case 'I': + case 'i': + e = new CigarElement(number, CigarOperator.I); + break; + + case 'D': + case 'd': + e = new CigarElement(number, CigarOperator.D); + break; + + case 'S': + case 's': + e = new CigarElement(number, CigarOperator.S); + break; + + case 'N': + case 'n': + e = new CigarElement(number, CigarOperator.N); + break; + + case 'H': + case 'h': + e = new CigarElement(number, CigarOperator.H); + break; + + case 'P': + case 'p': + e = new CigarElement(number, CigarOperator.P); + break; + + case '=': + e = new CigarElement(number, CigarOperator.EQ); + break; + + case 'X': + case 'x': + e = new CigarElement(number, CigarOperator.X); + break; + + default: + throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")"); + } + cigar.add(e); + } + } + return cigar; + } + } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index de996d187..2fbea1497 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -35,7 +35,7 @@ import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner import java.util.regex.Pattern import java.lang.StringBuffer import java.util.Date -import com.sun.jna.{Structure, StringArray, NativeLong} +import com.sun.jna.{Pointer, Structure, StringArray, NativeLong} import com.sun.jna.ptr.IntByReference /** @@ -295,9 +295,17 @@ object Lsf706JobRunner extends Logging { // the platform LSF startTimes are in seconds, not milliseconds, so convert to the java convention runner.getRunInfo.startTime = new Date(jobInfo.startTime.longValue * 1000) runner.getRunInfo.doneTime = new Date(jobInfo.endTime.longValue * 1000) - val exHostsRaw = jobInfo.exHosts.getStringArray(0) - //logger.warn("exHostsRaw = " + exHostsRaw) - val exHostsList = exHostsRaw.toSeq + + val exHostsList = + if (jobInfo.numExHosts != 1) { + // this is necessary because + val exHostsString = "multipleHosts_" + jobInfo.numExHosts + logger.debug("numExHosts = " + jobInfo.numExHosts + " != 1 for job " + runner.jobId + ", cannot safely get exhosts, setting to " + exHostsString) + List(exHostsString) + } else { + jobInfo.exHosts.getStringArray(0).toSeq + } + //logger.warn("exHostsList = " + exHostsList) val exHosts = exHostsList.reduceLeft(_ + "," + _) //logger.warn("exHosts = " + exHosts)