Merge branch 'master' of ssh://gsa4.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2012-08-19 09:23:21 -04:00
commit d9641e3d57
21 changed files with 338 additions and 112 deletions

View File

@ -207,7 +207,7 @@ plotVariantQC <- function(metrics, measures, requestedStrat = "Sample",
if ( requestedStrat == "Sample" ) {
perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale
perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)")
} else { # by AlleleCount
perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs)))
perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount")

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.commandline;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.lang.annotation.Annotation;
import java.util.List;
@ -147,6 +148,9 @@ public class ArgumentDefinition {
this.exclusiveOf = exclusiveOf;
this.validation = validation;
this.validOptions = validOptions;
validateName(shortName);
validateName(fullName);
}
/**
@ -192,6 +196,9 @@ public class ArgumentDefinition {
else
shortName = null;
validateName(shortName);
validateName(fullName);
this.ioType = ioType;
this.argumentType = argumentType;
this.fullName = fullName;
@ -277,4 +284,14 @@ public class ArgumentDefinition {
String validation = (String)CommandLineUtils.getValue(annotation, "validation");
return validation.trim().length() > 0 ? validation.trim() : null;
}
/**
* Make sure the argument's name is valid.
*
* Argument names may not begin with a dash ("-"): on the command line a leading
* dash marks the option prefix itself, so such a name could never be specified
* unambiguously. Throws a ReviewedStingException for an invalid name.
*
* @param name the short or full argument name to validate; null is allowed and accepted as-is
*/
private void validateName(final String name) {
if ( name != null && name.startsWith("-") )
throw new ReviewedStingException("Invalid argument definition: " + name + " begins with a -");
}
}

View File

@ -62,23 +62,24 @@ public class ReferenceDataSource {
* @param fastaFile Fasta file to be used as reference
*/
public ReferenceDataSource(File fastaFile) {
// does the fasta file exist? check that first...
if (!fastaFile.exists())
throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist.");
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
File dictFile;
if (fastaFile.getAbsolutePath().endsWith("fa")) {
dictFile = new File(fastaFile.getAbsolutePath().replace(".fa", ".dict"));
}
else
dictFile = new File(fastaFile.getAbsolutePath().replace(".fasta", ".dict"));
final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz");
final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
// determine the name for the dict file
final String fastaExt = (fastaFile.getAbsolutePath().endsWith("fa") ? ".fa" : ".fasta" ) + (isGzipped ? ".gz" : "");
final File dictFile = new File(fastaFile.getAbsolutePath().replace(fastaExt, ".dict"));
/*
if index file does not exist, create it manually
*/
* if index file does not exist, create it manually
*/
if (!indexFile.exists()) {
if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile);
logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath()));
FSLockWithShared indexLock = new FSLockWithShared(indexFile,true);
try {
@ -95,7 +96,7 @@ public class ReferenceDataSource {
}
catch(UserException e) {
// Rethrow all user exceptions as-is; there should be more details in the UserException itself.
throw e;
throw e;
}
catch (Exception e) {
// If lock creation succeeded, the failure must have been generating the index.
@ -114,6 +115,8 @@ public class ReferenceDataSource {
* This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary
*/
if (!dictFile.exists()) {
if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile);
logger.info(String.format("Dict file %s does not exist. Trying to create it now.", dictFile.getAbsolutePath()));
/*
@ -218,9 +221,9 @@ public class ReferenceDataSource {
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
shards.add(new LocusShard(parser,
readsDataSource,
Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
null));
readsDataSource,
Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
null));
}
}
return shards;

View File

@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
/**
* Filter out reads with wonky cigar strings.
*
* - No reads with Hard/Soft clips in the middle of the cigar
* - No reads starting with deletions (with or without preceding clips)
* - No reads ending in deletions (with or without follow-up clips)
* - No reads that are fully hard or soft clipped
* - No reads that have consecutive indels in the cigar (II, DD, ID or DI)
*
* ps: apparently an empty cigar is okay...
*
* @author ebanks
* @version 0.1
*/
@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter {
public boolean filterOut(final SAMRecord rec) {
final Cigar c = rec.getCigar();
if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad
boolean previousElementWasIndel = false;
CigarOperator lastOp = c.getCigarElement(0).getOperator();
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
return true;
for (CigarElement ce : c.getCigarElements()) {
CigarOperator op = ce.getOperator();
if (op == CigarOperator.D || op == CigarOperator.I) {
if (previousElementWasIndel)
return true; // filter out reads with adjacent I/D
previousElementWasIndel = true;
}
else // this is a regular base (match/mismatch/hard or soft clip)
previousElementWasIndel = false; // reset the previous element
lastOp = op;
// if there is no Cigar then it can't be bad
if( c.isEmpty() ) {
return false;
}
return lastOp == CigarOperator.D;
Iterator<CigarElement> elementIterator = c.getCigarElements().iterator();
CigarOperator firstOp = CigarOperator.H;
while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) {
CigarOperator op = elementIterator.next().getOperator();
// No reads with Hard/Soft clips in the middle of the cigar
if (firstOp != CigarOperator.H && op == CigarOperator.H) {
return true;
}
firstOp = op;
}
// No reads starting with deletions (with or without preceding clips)
if (firstOp == CigarOperator.D) {
return true;
}
boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S);
boolean previousElementWasIndel = firstOp == CigarOperator.I;
CigarOperator lastOp = firstOp;
CigarOperator previousOp = firstOp;
while (elementIterator.hasNext()) {
CigarOperator op = elementIterator.next().getOperator();
if (op != CigarOperator.S && op != CigarOperator.H) {
// No reads with Hard/Soft clips in the middle of the cigar
if (previousOp == CigarOperator.S || previousOp == CigarOperator.H)
return true;
lastOp = op;
if (!hasMeaningfulElements && op.consumesReadBases()) {
hasMeaningfulElements = true;
}
if (op == CigarOperator.I || op == CigarOperator.D) {
// No reads that have consecutive indels in the cigar (II, DD, ID or DI)
if (previousElementWasIndel) {
return true;
}
previousElementWasIndel = true;
}
else {
previousElementWasIndel = false;
}
}
// No reads with Hard/Soft clips in the middle of the cigar
else if (op == CigarOperator.S && previousOp == CigarOperator.H) {
return true;
}
previousOp = op;
}
// No reads ending in deletions (with or without follow-up clips)
// No reads that are fully hard or soft clipped
return lastOp == CigarOperator.D || !hasMeaningfulElements;
}
}

View File

@ -159,7 +159,7 @@ public class LocusIteratorByState extends LocusIterator {
return stepForwardOnGenome();
} else {
if (curElement != null && curElement.getOperator() == CigarOperator.D)
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
@ -185,7 +185,7 @@ public class LocusIteratorByState extends LocusIterator {
break;
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// should be the same as N case
genomeOffset++;
done = true;

View File

@ -75,8 +75,6 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
if( !dataProvider.hasReads() )
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
boolean needsReferenceBasesP = WalkerManager.isRequired(walker, DataSource.REFERENCE_BASES);
ReadView reads = new ReadView(dataProvider);
ReadReferenceView reference = new ReadReferenceView(dataProvider);
@ -91,7 +89,7 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
ReferenceContext refContext = null;
// get the array of characters for the reference sequence, since we're a mapped read
if (needsReferenceBasesP && !read.getReadUnmappedFlag() && dataProvider.hasReference())
if (!read.getReadUnmappedFlag() && dataProvider.hasReference())
refContext = reference.getReferenceContext(read);
// update the number of reads we've seen

View File

@ -28,7 +28,7 @@ import java.util.List;
*/
@By(DataSource.READS)
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@Requires({DataSource.READS, DataSource.REFERENCE})
@PartitionBy(PartitionType.READ)
@ActiveRegionExtension(extension=50,maxRegion=1500)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})

View File

@ -16,8 +16,18 @@ package org.broadinstitute.sting.gatk.walkers;
* Allow user to choose between a number of different data sources.
*/
public enum DataSource {
/**
* Does this walker require read (BAM) data to work?
*/
READS,
/**
* Does this walker require reference data to work?
*/
REFERENCE,
REFERENCE_BASES, // Do I actually need the reference bases passed to the walker?
/**
* Does this walker require reference order data (VCF) to work?
*/
REFERENCE_ORDERED_DATA
}

View File

@ -16,7 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
* To change this template use File | Settings | File Templates.
*/
@By(DataSource.READS)
@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES})
@Requires({DataSource.READS,DataSource.REFERENCE})
@PartitionBy(PartitionType.LOCUS)
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
@RemoveProgramRecords

View File

@ -12,7 +12,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@Requires({DataSource.READS, DataSource.REFERENCE})
@PartitionBy(PartitionType.READ)
public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
public boolean requiresOrderedReads() { return false; }

View File

@ -8,7 +8,7 @@ package org.broadinstitute.sting.gatk.walkers;
* To change this template use File | Settings | File Templates.
*/
@By(DataSource.REFERENCE)
@Requires({DataSource.REFERENCE, DataSource.REFERENCE_BASES})
@Requires({DataSource.REFERENCE})
@Allows(DataSource.REFERENCE)
public abstract class RefWalker<MapType, ReduceType> extends LocusWalker<MapType, ReduceType> {
}

View File

@ -107,7 +107,7 @@ import java.util.ArrayList;
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By(DataSource.READS)
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) // filter out all reads with zero or unavailable mapping quality
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeReducible<Long> {
@ArgumentCollection

View File

@ -474,10 +474,17 @@ public class UnifiedGenotyperEngine {
// add the MLE AC and AF annotations
if ( alleleCountsofMLE.size() > 0 ) {
attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
final double AN = (double)builder.make().getCalledChrCount();
final int AN = builder.make().getCalledChrCount();
// let's sanity check that we don't have an invalid MLE value in there
for ( int MLEAC : alleleCountsofMLE ) {
if ( MLEAC > AN )
throw new ReviewedStingException(String.format("MLEAC value (%d) is larger than AN (%d) at position %s:%d", MLEAC, AN, loc.getContig(), loc.getStart()));
}
final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
for ( int AC : alleleCountsofMLE )
MLEfrequencies.add((double)AC / AN);
MLEfrequencies.add((double)AC / (double)AN);
attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
}

View File

@ -645,7 +645,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
bestChildGenotype.clear();
bestChildGenotype.add(childGenotype.getKey());
}
else if(MathUtils.compareDoubles(configurationLikelihood, bestConfigurationLikelihood) == 0) {
else if(configurationLikelihood == bestConfigurationLikelihood) {
bestFirstParentGenotype.add(firstParentGenotype.getKey());
bestSecondParentGenotype.add(secondParentGenotype.getKey());
bestChildGenotype.add(childGenotype.getKey());

View File

@ -118,6 +118,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
decoder.readNextBlock(sitesBlockSize, inputStream);
decodeSiteLoc(builder);
final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);

View File

@ -82,7 +82,7 @@ public final class BCF2Decoder {
public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) {
try {
final int bytesRead = (int)stream.skip(blockSizeInBytes);
validateReadBytes(bytesRead, blockSizeInBytes);
validateReadBytes(bytesRead, 1, blockSizeInBytes);
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e);
}
@ -316,17 +316,37 @@ public final class BCF2Decoder {
}
/**
* Read all bytes for a BCF record block into a byte[], and return it
*
* @param inputStream
* @return
* Is smart about reading from the stream multiple times to fill the buffer, if necessary
*
* @param blockSizeInBytes number of bytes to read
* @param inputStream the stream to read from
* @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream
*/
private final static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) {
@Requires({"blockSizeInBytes >= 0", "inputStream != null"})
@Ensures("result != null")
private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) {
assert blockSizeInBytes >= 0;
final byte[] record = new byte[blockSizeInBytes];
try {
final int bytesRead = inputStream.read(record);
validateReadBytes(bytesRead, blockSizeInBytes);
int bytesRead = 0;
int nReadAttempts = 0; // keep track of how many times we've read
// because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF
while ( bytesRead < blockSizeInBytes ) {
final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead);
if ( read1 == -1 )
validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes);
else
bytesRead += read1;
}
if ( nReadAttempts > 1 ) // TODO -- remove me
logger.warn("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior");
validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes);
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e);
}
@ -334,14 +354,20 @@ public final class BCF2Decoder {
return record;
}
private final static void validateReadBytes(final int actuallyRead, final int expected) {
/**
* Make sure we read the right number of bytes, or throw an error
*
* @param actuallyRead
* @param nReadAttempts
* @param expected
*/
private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) {
assert expected >= 0;
if ( actuallyRead < expected ) {
throw new UserException.MalformedBCF2(String.format("Failed to read next complete record: %s",
actuallyRead == -1 ?
"premature end of input stream" :
String.format("expected %d bytes but read only %d", expected, actuallyRead)));
throw new UserException.MalformedBCF2(
String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations",
expected, actuallyRead, nReadAttempts));
}
}

View File

@ -340,6 +340,17 @@ public class UserException extends ReviewedStingException {
}
}
public static class CouldNotCreateReferenceFAIorDictForGzippedRef extends UserException {
    /**
     * User error raised when a gzipped reference fasta is missing its .fai index or
     * .dict dictionary: the GATK cannot generate these companion files for compressed
     * references, so the user must create them manually.
     *
     * @param f the gzipped reference fasta file that triggered the error; its path is
     *          now included in the message (previously the parameter was accepted but unused)
     */
    public CouldNotCreateReferenceFAIorDictForGzippedRef(final File f) {
        super("Although the GATK can process .gz reference sequences, it currently cannot create FAI " +
                "or DICT files for them. In order to use the GATK with reference.fasta.gz you will need to " +
                "create .dict and .fai files for reference.fasta.gz and name them reference.fasta.gz.fai and " +
                "reference.dict. Potentially the easiest way to do this is to uncompress reference.fasta, " +
                "run the GATK to create the .dict and .fai files, and copy them to the appropriate location. " +
                "Sorry for the inconvenience. (Offending reference: " + f.getAbsolutePath() + ")");
    }
}
public static class CouldNotCreateReferenceIndexFileBecauseOfLock extends UserException.CouldNotCreateReferenceIndexFile {
public CouldNotCreateReferenceIndexFileBecauseOfLock(File f) {
super(f, "could not be written because an exclusive file lock could not be obtained. " +

View File

@ -41,6 +41,8 @@ import java.util.Arrays;
* Thread-safe! Uses a lock object to protect write and access to the cache.
*/
public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class);
/** global enable flag */
private static final boolean USE_CACHE = true;
@ -125,7 +127,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
public void printEfficiency() {
// comment out to disable tracking
if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) {
System.out.printf("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency());
logger.info(String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency()));
}
}

View File

@ -1,11 +1,14 @@
package org.broadinstitute.sting.gatk.filters;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import net.sf.samtools.Cigar;
import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.List;
/**
* Checks that the Bad Cigar filter works for all kinds of wonky cigars
*
@ -14,6 +17,29 @@ import org.testng.annotations.Test;
*/
public class BadCigarFilterUnitTest {
public static final String[] BAD_CIGAR_LIST = {
"2D4M", // starting with multiple deletions
"4M2D", // ending with multiple deletions
"3M1I1D", // adjacent indels AND ends in deletion
"1M1I1D2M", // adjacent indels I->D
"1M1D2I1M", // adjacent indels D->I
"1M1I2M1D", // ends in single deletion with insertion in the middle
"4M1D", // ends in single deletion
"1D4M", // starts with single deletion
"2M1D1D2M", // adjacent D's
"1M1I1I1M", // adjacent I's
"1H1D4M", // starting with deletion after H
"1S1D3M", // starting with deletion after S
"1H1S1D3M", // starting with deletion after HS
"4M1D1H", // ending with deletion before H
"3M1D1S", // ending with deletion before S
"3M1D1S1H", // ending with deletion before HS
"10M2H10M", // H in the middle
"10M2S10M", // S in the middle
"1H1S10M2S10M1S1H", // deceiving S in the middle
"1H1S10M2H10M1S1H" // deceiving H in the middle
};
BadCigarFilter filter;
@BeforeClass
@ -21,40 +47,20 @@ public class BadCigarFilterUnitTest {
filter = new BadCigarFilter();
}
@Test
@Test(enabled = true)
public void testWonkyCigars () {
byte[] bases = {'A', 'A', 'A', 'A'};
byte[] quals = {30, 30, 30, 30};
GATKSAMRecord read;
// starting with multiple deletions
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M");
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
for (String cigarString : BAD_CIGAR_LIST) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString);
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
}
}
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
@Test(enabled = true)
public void testGoodCigars() {
List<Cigar> cigarList = ReadClipperTestUtils.generateCigarList(10);
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
Assert.assertFalse(filter.filterOut(read), read.getCigarString());
}
}
}

View File

@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
@ -37,17 +38,22 @@ public class ReadClipperTestUtils {
return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString());
}
/**
* This function generates every valid permutation of cigar strings with a given length.
*
* A valid cigar object obeys the following rules:
* - No Hard/Soft clips in the middle of the read
* - No deletions in the beginning / end of the read
* - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
*
* @param maximumLength the maximum number of elements in the cigar
* @return a list with all valid Cigar objects
*/
public static GATKSAMRecord makeReadFromCigar(String cigarString) {
return makeReadFromCigar(cigarFromString(cigarString));
}
/**
* This function generates every valid permutation of cigar strings with a given length.
*
* A valid cigar object obeys the following rules:
* - No Hard/Soft clips in the middle of the read
* - No deletions in the beginning / end of the read
* - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
* - No consecutive I/D elements
*
* @param maximumLength the maximum number of elements in the cigar
* @return a list with all valid Cigar objects
*/
public static List<Cigar> generateCigarList(int maximumLength) {
int numCigarElements = cigarElements.length;
LinkedList<Cigar> cigarList = new LinkedList<Cigar>();
@ -137,7 +143,10 @@ public class ReadClipperTestUtils {
CigarElement lastElement = null;
int lastElementLength = 0;
for (CigarElement cigarElement : rawCigar.getCigarElements()) {
if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator())
if (lastElement != null &&
((lastElement.getOperator() == cigarElement.getOperator()) ||
(lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) ||
(lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I)))
lastElementLength += cigarElement.getLength();
else
{
@ -191,7 +200,7 @@ public class ReadClipperTestUtils {
/**
* Checks whether or not the read has any cigar element that is not H or S
*
* @param read
* @param read the read
* @return true if it has any M, I or D, false otherwise
*/
public static boolean readHasNonClippedBases(GATKSAMRecord read) {
@ -201,5 +210,79 @@ public class ReadClipperTestUtils {
return false;
}
/**
 * Builds a Cigar object from a text cigar string such as "10M2I3D".
 *
 * Operator characters are case-insensitive; M, I, D, S, N, H, P, = and X are
 * supported. An unrecognized operator character triggers a ReviewedStingException.
 *
 * Bug fix: the accumulated element length is now reset to zero after each element
 * is emitted. Previously the digits of earlier elements leaked into later ones,
 * e.g. "2M3I" parsed as 2M followed by 23I (after emitting 2M, number stayed 2,
 * then '3' extended it to 2*10+3 = 23).
 *
 * @param cigarString the text representation of the cigar; must not be null
 * @return a Cigar containing one CigarElement per operator in the string
 */
public static Cigar cigarFromString(String cigarString) {
    final Cigar cigar = new Cigar();
    int number = 0;                                      // length accumulator for the current element
    for (int i = 0; i < cigarString.length(); i++) {
        final char x = cigarString.charAt(i);
        if (x >= '0' && x <= '9') {
            number = number * 10 + (x - '0');            // extend the current element's length
        }
        else {
            final CigarOperator op;
            switch (Character.toUpperCase(x)) {          // case-insensitive, as in the original explicit pairs
                case 'M': op = CigarOperator.M;  break;
                case 'I': op = CigarOperator.I;  break;
                case 'D': op = CigarOperator.D;  break;
                case 'S': op = CigarOperator.S;  break;
                case 'N': op = CigarOperator.N;  break;
                case 'H': op = CigarOperator.H;  break;
                case 'P': op = CigarOperator.P;  break;
                case '=': op = CigarOperator.EQ; break;
                case 'X': op = CigarOperator.X;  break;
                default:
                    throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")");
            }
            cigar.add(new CigarElement(number, op));
            number = 0;                                  // BUG FIX: reset so digits don't leak into the next element
        }
    }
    return cigar;
}
}

View File

@ -35,7 +35,7 @@ import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner
import java.util.regex.Pattern
import java.lang.StringBuffer
import java.util.Date
import com.sun.jna.{Structure, StringArray, NativeLong}
import com.sun.jna.{Pointer, Structure, StringArray, NativeLong}
import com.sun.jna.ptr.IntByReference
/**
@ -295,9 +295,17 @@ object Lsf706JobRunner extends Logging {
// the platform LSF startTimes are in seconds, not milliseconds, so convert to the java convention
runner.getRunInfo.startTime = new Date(jobInfo.startTime.longValue * 1000)
runner.getRunInfo.doneTime = new Date(jobInfo.endTime.longValue * 1000)
val exHostsRaw = jobInfo.exHosts.getStringArray(0)
//logger.warn("exHostsRaw = " + exHostsRaw)
val exHostsList = exHostsRaw.toSeq
val exHostsList =
if (jobInfo.numExHosts != 1) {
// NOTE(review): original comment was truncated ("this is necessary because...") —
// presumably exHosts.getStringArray(0) is only safe to read when exactly one
// execution host is present; confirm against the LSF JNA jobInfoEnt bindings
val exHostsString = "multipleHosts_" + jobInfo.numExHosts
logger.debug("numExHosts = " + jobInfo.numExHosts + " != 1 for job " + runner.jobId + ", cannot safely get exhosts, setting to " + exHostsString)
List(exHostsString)
} else {
jobInfo.exHosts.getStringArray(0).toSeq
}
//logger.warn("exHostsList = " + exHostsList)
val exHosts = exHostsList.reduceLeft(_ + "," + _)
//logger.warn("exHosts = " + exHosts)