diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..744a93f45 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +/*.bam +/*.bai +/*.bed +*.idx +*~ +/*.vcf +/*.txt +/*.csh +/.* +/*.pdf +/*.eval +*.ipr +*.iws +queueScatterGather +/foo* +/bar* +integrationtests/ +public/testdata/onTheFlyOutputTest.vcf \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 74d39ecb0..8452aadfd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -43,6 +43,7 @@ import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; import java.lang.reflect.InvocationTargetException; @@ -57,6 +58,8 @@ import java.util.*; * Converts shards to SAM iterators over the specified region */ public class SAMDataSource { + final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); + /** Backing support for reads. */ protected final ReadProperties readProperties; @@ -644,7 +647,9 @@ public class SAMDataSource { BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, byte defaultBaseQualities) { - wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); + if ( useOriginalBaseQualities || defaultBaseQualities >= 0 ) + // only wrap if we are replacing the original qualitiies or using a default base quality + wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); // NOTE: this (and other filtering) should be done before on-the-fly sorting // as there is no reason to sort something that we will end of throwing away @@ -756,6 +761,7 @@ public class SAMDataSource { public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { for(SAMReaderID readerID: readerIDs) { SAMFileReader reader = new SAMFileReader(readerID.samFile); + reader.setSAMRecordFactory(factory); reader.enableFileSource(true); reader.enableIndexMemoryMapping(false); if(!enableLowMemorySharding) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java index 2f30d12a8..9a89d2086 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** * An iterator which does post-processing of a read, including potentially wrapping @@ -78,7 +77,30 @@ public class ReadFormattingIterator implements StingSAMIterator { * no next exists. */ public SAMRecord next() { - return new GATKSAMRecord(wrappedIterator.next(), useOriginalBaseQualities, defaultBaseQualities); + SAMRecord rec = wrappedIterator.next(); + + // if we are using default quals, check if we need them, and add if necessary. + // 1. we need if reads are lacking or have incomplete quality scores + // 2. we add if defaultBaseQualities has a positive value + if (defaultBaseQualities >= 0) { + byte reads [] = rec.getReadBases(); + byte quals [] = rec.getBaseQualities(); + if (quals == null || quals.length < reads.length) { + byte new_quals [] = new byte [reads.length]; + for (int i=0; i 1 when the read is a consensus read representing multiple independent observations - final boolean isReduced = ReadUtils.isReducedRead(p.getRead()); + final boolean isReduced = p.isReducedRead(); readCounts[readIdx] = isReduced ? p.getReducedCount() : 1; // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) @@ -420,10 +420,6 @@ public class PairHMMIndelErrorModel { if (read == null) continue; - if ( isReduced ) { - read = ReadUtils.reducedReadWithReducedQuals(read); - } - if(ReadUtils.is454Read(read)) { continue; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index e117454f9..e10334a77 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -2,9 +2,12 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; +import java.util.EnumSet; import java.util.List; /* @@ -46,6 +49,9 @@ import java.util.List; */ public class CycleCovariate implements StandardCovariate { + private final static EnumSet DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); + private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); + // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { if( RAC.DEFAULT_PLATFORM != null ) { @@ -58,122 +64,6 @@ public class CycleCovariate implements StandardCovariate { } } - /* - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - - int cycle = 1; - - //----------------------------- - // ILLUMINA and SOLID - //----------------------------- - - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" - read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" - cycle = offset + 1; - if( read.getReadNegativeStrandFlag() ) { - cycle = read.getReadLength() - offset; - } - } - - //----------------------------- - // 454 - //----------------------------- - - else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" - final byte[] bases = read.getReadBases(); - - // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change - // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - int iii = 0; - while( iii <= offset ) - { - while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; } - if( iii <= offset ) { cycle++; } - if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; } - - } - } else { // Negative direction - int iii = bases.length-1; - while( iii >= offset ) - { - while( iii >= offset && bases[iii] == (byte)'T' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; } - if( iii >= offset ) { cycle++; } - if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; } - } - } - } - - //----------------------------- - // SOLID (unused), only to be used in conjunction with PrimerRoundCovariate - //----------------------------- - - //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { - // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf - // int pos = offset + 1; - // if( read.getReadNegativeStrandFlag() ) { - // pos = read.getReadLength() - offset; - // } - // cycle = pos / 5; // integer division - //} - - //----------------------------- - // UNRECOGNIZED PLATFORM - //----------------------------- - - else { // Platform is unrecognized so revert to the default platform but warn the user first - if( defaultPlatform != null) { // The user set a default platform - if( !warnedUserBadPlatform ) { - Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + - "Defaulting to platform = " + defaultPlatform + "." ); - } - warnedUserBadPlatform = true; - - read.getReadGroup().setPlatform( defaultPlatform ); - return getValue( read, offset ); // A recursive call - } else { // The user did not set a default platform - throw new StingException( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + - "No default platform specified. Users must set the default platform using the --default_platform argument." ); - } - } - - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. - // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { - cycle *= -1; - } - - return cycle; - } - */ - - // todo -- this should be put into a common place in the code base - private static List ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA"); - private static List SOLID_NAMES = Arrays.asList("SOLID"); - private static List LS454_NAMES = Arrays.asList("454"); - private static List COMPLETE_GENOMICS_NAMES = Arrays.asList("COMPLETE"); - private static List PACBIO_NAMES = Arrays.asList("PACBIO"); - private static List ION_TORRENT_NAMES = Arrays.asList("IONTORRENT"); - - private static boolean isPlatform(SAMRecord read, List names) { - String pl = read.getReadGroup().getPlatform().toUpperCase(); - for ( String name : names ) - if ( pl.contains( name ) ) - return true; - return false; - } - // Used to pick out the covariate's value from attributes of the read public void getValues(SAMRecord read, Comparable[] comparable) { @@ -181,7 +71,8 @@ public class CycleCovariate implements StandardCovariate { // Illumina, Solid, PacBio, and Complete Genomics //----------------------------- - if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES) || isPlatform(read, COMPLETE_GENOMICS_NAMES) ) { + final NGSPlatform ngsPlatform = ((GATKSAMRecord)read).getNGSPlatform(); + if( DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform) ) { final int init; final int increment; if( !read.getReadNegativeStrandFlag() ) { @@ -227,8 +118,7 @@ public class CycleCovariate implements StandardCovariate { //----------------------------- // 454 and Ion Torrent //----------------------------- - - else if ( isPlatform(read, LS454_NAMES) || isPlatform(read, ION_TORRENT_NAMES)) { // Some bams have "LS454" and others have just "454" + else if( FLOW_CYCLE_PLATFORMS.contains(ngsPlatform) ) { final int readLength = read.getReadLength(); final byte[] bases = read.getReadBases(); @@ -273,8 +163,6 @@ public class CycleCovariate implements StandardCovariate { else { throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); } - - } // Used to get the covariate's value from input csv file in TableRecalibrationWalker diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 2daa8c025..a0c928afa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -228,8 +229,7 @@ public class RecalDataManager { * @param RAC The list of shared command line arguments */ public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { - - SAMReadGroupRecord readGroup = read.getReadGroup(); + GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord)read).getReadGroup(); // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments if( readGroup == null ) { @@ -241,7 +241,7 @@ public class RecalDataManager { warnUserNullReadGroup = true; } // There is no readGroup so defaulting to these values - readGroup = new SAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); + readGroup = new GATKSAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); ((GATKSAMRecord)read).setReadGroup( readGroup ); } else { @@ -251,7 +251,7 @@ public class RecalDataManager { if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user final String oldPlatform = readGroup.getPlatform(); - readGroup = new SAMReadGroupRecord( RAC.FORCE_READ_GROUP ); + readGroup = new GATKSAMReadGroupRecord( RAC.FORCE_READ_GROUP ); readGroup.setPlatform( oldPlatform ); ((GATKSAMRecord)read).setReadGroup( readGroup ); } diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java new file mode 100644 index 000000000..4f01f2b7a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; + +/** + * A canonical, master list of the standard NGS platforms. These values + * can be obtained (efficiently) from a GATKSAMRecord object with the + * getNGSPlatform method. + * + * @author Mark DePristo + * @since 2011 + */ +public enum NGSPlatform { + ILLUMINA("ILLUMINA", "SLX", "SOLEXA"), + SOLID("SOLID"), + LS454("454"), + COMPLETE_GENOMICS("COMPLETE"), + PACBIO("PACBIO"), + ION_TORRENT("IONTORRENT"), + UNKNOWN("UNKNOWN"); + + /** + * Array of the prefix names in a BAM file for each of the platforms. + */ + private final String[] BAM_PL_NAMES; + + NGSPlatform(final String... BAM_PL_NAMES) { + for ( int i = 0; i < BAM_PL_NAMES.length; i++ ) + BAM_PL_NAMES[i] = BAM_PL_NAMES[i].toUpperCase(); + this.BAM_PL_NAMES = BAM_PL_NAMES; + } + + /** + * Returns a representative PL string for this platform + * @return + */ + public final String getDefaultPlatform() { + return BAM_PL_NAMES[0]; + } + + /** + * Convenience constructor -- calculates the NGSPlatfrom from a SAMRecord. + * Note you should not use this function if you have a GATKSAMRecord -- use the + * accessor method instead. + * + * @param read + * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match + */ + public static final NGSPlatform fromRead(SAMRecord read) { + return fromReadGroup(read.getReadGroup()); + } + + /** + * Returns the NGSPlatform corresponding to the PL tag in the read group + * @param rg + * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match + */ + public static final NGSPlatform fromReadGroup(SAMReadGroupRecord rg) { + return fromReadGroupPL(rg.getPlatform()); + } + + /** + * Returns the NGSPlatform corresponding to the PL tag in the read group + * @param plFromRG -- the PL field (or equivalent) in a ReadGroup object + * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match + */ + public static final NGSPlatform fromReadGroupPL(final String plFromRG) { + if ( plFromRG == null ) return UNKNOWN; + + // todo -- algorithm could be implemented more efficiently, as the list of all + // todo -- names is known upfront, so a decision tree could be used to identify + // todo -- a prefix common to PL + final String pl = plFromRG.toUpperCase(); + for ( final NGSPlatform ngsPlatform : NGSPlatform.values() ) { + for ( final String bamPLName : ngsPlatform.BAM_PL_NAMES ) { + if ( pl.contains(bamPLName) ) + return ngsPlatform; + } + } + + return UNKNOWN; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java index f7d237401..4eda7c7cd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java @@ -1,9 +1,10 @@ package org.broadinstitute.sting.utils.pileup; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; /** * An easy to access fragment-based pileup, which contains two separate pileups. The first @@ -13,31 +14,51 @@ import java.util.Map; * * Based on the original code by E. Banks * - * TODO -- technically we could generalize this code to support a pseudo-duplicate marking - * TODO -- algorithm that could collect all duplicates into a single super pileup element + * Oct 21: note that the order of the oneReadPileup and twoReadPileups are not + * defined. The algorithms that produce these lists are in fact producing + * lists of Pileup elements *NOT* sorted by alignment start position of the underlying + * reads. * * User: depristo * Date: 3/26/11 * Time: 10:09 PM */ public class FragmentPileup { - final Collection oneReadPile; - final Collection twoReadPile = new ArrayList(); + Collection oneReadPile = null; + Collection twoReadPile = null; + + protected enum FragmentMatchingAlgorithm { + ORIGINAL, + skipNonOverlapping, + } /** * Create a new Fragment-based pileup from the standard read-based pileup * @param pileup */ public FragmentPileup(ReadBackedPileup pileup) { - Map nameMap = new HashMap(); + skipNonOverlapping(pileup); + } + + /** For performance testing only */ + protected FragmentPileup(ReadBackedPileup pileup, FragmentMatchingAlgorithm algorithm) { + switch ( algorithm ) { + case ORIGINAL: oldSlowCalculation(pileup); break; + case skipNonOverlapping: skipNonOverlapping(pileup); break; + } + } + + private final void oldSlowCalculation(final ReadBackedPileup pileup) { + final Map nameMap = new HashMap(pileup.size()); // build an initial map, grabbing all of the multi-read fragments - for ( PileupElement p : pileup ) { - String readName = p.getRead().getReadName(); + for ( final PileupElement p : pileup ) { + final String readName = p.getRead().getReadName(); - PileupElement pe1 = nameMap.get(readName); + final PileupElement pe1 = nameMap.get(readName); if ( pe1 != null ) { // assumes we have at most 2 reads per fragment + if ( twoReadPile == null ) twoReadPile = new ArrayList(); twoReadPile.add(new TwoReadPileupElement(pe1, p)); nameMap.remove(readName); } else { @@ -45,17 +66,54 @@ public class FragmentPileup { } } - // now set the one Read pile to the values in the nameMap with only a single read oneReadPile = nameMap.values(); } + private final void skipNonOverlapping(final ReadBackedPileup pileup) { + Map nameMap = null; + + // build an initial map, grabbing all of the multi-read fragments + for ( final PileupElement p : pileup ) { + final SAMRecord read = p.getRead(); + final int mateStart = read.getMateAlignmentStart(); + + if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) { + // if we know that this read won't overlap its mate, or doesn't have one, jump out early + if ( oneReadPile == null ) oneReadPile = new ArrayList(pileup.size()); // lazy init + oneReadPile.add(p); + } else { + // the read might overlap it's mate, or is the rightmost read of a pair + final String readName = p.getRead().getReadName(); + final PileupElement pe1 = nameMap == null ? null : nameMap.get(readName); + if ( pe1 != null ) { + // assumes we have at most 2 reads per fragment + if ( twoReadPile == null ) twoReadPile = new ArrayList(); // lazy init + twoReadPile.add(new TwoReadPileupElement(pe1, p)); + nameMap.remove(readName); + } else { + if ( nameMap == null ) nameMap = new HashMap(pileup.size()); // lazy init + nameMap.put(readName, p); + } + } + } + + // add all of the reads that are potentially overlapping but whose mate never showed + // up to the oneReadPile + if ( nameMap != null && ! nameMap.isEmpty() ) { + if ( oneReadPile == null ) + oneReadPile = nameMap.values(); + else + oneReadPile.addAll(nameMap.values()); + } + } + /** * Gets the pileup elements containing two reads, in no particular order * * @return */ public Collection getTwoReadPileup() { - return twoReadPile; + return twoReadPile == null ? Collections.emptyList() : twoReadPile; } /** @@ -64,7 +122,7 @@ public class FragmentPileup { * @return */ public Collection getOneReadPileup() { - return oneReadPile; + return oneReadPile == null ? Collections.emptyList() : oneReadPile; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index f6ed792a5..3d6b6f4b9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -4,6 +4,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; /** @@ -12,7 +13,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * Date: Apr 14, 2009 * Time: 8:54:05 AM */ -public class PileupElement { +public class PileupElement implements Comparable { public static final byte DELETION_BASE = BaseUtils.D; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; @@ -75,6 +76,20 @@ public class PileupElement { return isDeletion() ? DELETION_QUAL : read.getBaseQualities()[offset]; } + @Override + public int compareTo(final PileupElement pileupElement) { + if ( offset < pileupElement.offset ) + return -1; + else if ( offset > pileupElement.offset ) + return 1; + else if ( read.getAlignmentStart() < pileupElement.read.getAlignmentStart() ) + return -1; + else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() ) + return 1; + else + return 0; + } + // -------------------------------------------------------------------------- // // Reduced read accessors @@ -82,16 +97,16 @@ public class PileupElement { // -------------------------------------------------------------------------- public boolean isReducedRead() { - return ReadUtils.isReducedRead(getRead()); + return ((GATKSAMRecord)read).isReducedRead(); } public int getReducedCount() { if ( ! isReducedRead() ) throw new IllegalArgumentException("Cannot get reduced count for non-reduced read " + getRead().getReadName()); - return ReadUtils.getReducedCount(getRead(), offset); + return ((GATKSAMRecord)read).getReducedCount(offset); } public byte getReducedQual() { if ( ! isReducedRead() ) throw new IllegalArgumentException("Cannot get reduced qual for non-reduced read " + getRead().getReadName()); - return ReadUtils.getReducedQual(getRead(), offset); + return getQual(); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 2dcdd5ce6..1b3641128 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -2,11 +2,15 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import java.io.File; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * @author aaron @@ -29,7 +33,7 @@ public class ArtificialSAMUtils { File outFile = new File(filename); SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile); - + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) { out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH)); @@ -134,6 +138,7 @@ public class ArtificialSAMUtils { /** * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read * + * * @param header the SAM header to associate the read with * @param name the name of the read * @param refIndex the reference index, i.e. what chromosome to associate it with @@ -142,11 +147,11 @@ public class ArtificialSAMUtils { * * @return the artificial read */ - public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { if( (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || - (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) ) + (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) ) throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart); - SAMRecord record = new SAMRecord(header); + GATKSAMRecord record = new GATKSAMRecord(header); record.setReadName(name); record.setReferenceIndex(refIndex); record.setAlignmentStart(alignmentStart); @@ -166,6 +171,7 @@ public class ArtificialSAMUtils { if (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { record.setReadUnmappedFlag(true); } + return record; } @@ -181,19 +187,51 @@ public class ArtificialSAMUtils { * * @return the artificial read */ - public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual ) { + public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual ) { if (bases.length != qual.length) { throw new ReviewedStingException("Passed in read string is different length then the quality array"); } - SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); rec.setReadBases(bases); rec.setBaseQualities(qual); if (refIndex == -1) { rec.setReadUnmappedFlag(true); } + return rec; } + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { + SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); + SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); + + left.setReadPairedFlag(true); + right.setReadPairedFlag(true); + + left.setProperPairFlag(true); + right.setProperPairFlag(true); + + left.setFirstOfPairFlag(leftIsFirst); + right.setFirstOfPairFlag(! leftIsFirst); + + left.setReadNegativeStrandFlag(leftIsNegative); + left.setMateNegativeStrandFlag(!leftIsNegative); + right.setReadNegativeStrandFlag(!leftIsNegative); + right.setMateNegativeStrandFlag(leftIsNegative); + + left.setMateAlignmentStart(right.getAlignmentStart()); + right.setMateAlignmentStart(left.getAlignmentStart()); + + left.setMateReferenceIndex(0); + right.setMateReferenceIndex(0); + + int isize = rightStart + readLen - leftStart; + left.setInferredInsertSize(isize); + right.setInferredInsertSize(-isize); + + return Arrays.asList(left, right); + } + /** * create an iterator containing the specified read piles * @@ -255,4 +293,52 @@ public class ArtificialSAMUtils { return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } + + private final static int ranIntInclusive(Random ran, int start, int stop) { + final int range = stop - start; + return ran.nextInt(range) + start; + } + + /** + * Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with + * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert + * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second + * may be, depending on where this sampled insertSize puts it. + * @param header + * @param loc + * @param readLen + * @param insertSize + * @param pileupSize + * @return + */ + public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) { + final Random ran = new Random(); + final boolean leftIsFirst = true; + final boolean leftIsNegative = false; + final int insertSizeVariation = insertSize / 10; + final int pos = loc.getStart(); + + final List pileupElements = new ArrayList(); + for ( int i = 0; i < pileupSize / 2; i++ ) { + final String readName = "read" + i; + final int leftStart = ranIntInclusive(ran, 1, pos); + final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize); + final int rightStart = leftStart + fragmentSize - readLen; + + if ( rightStart <= 0 ) continue; + + List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + final SAMRecord left = pair.get(0); + final SAMRecord right = pair.get(1); + + pileupElements.add(new PileupElement(left, pos - leftStart)); + + if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) { + pileupElements.add(new PileupElement(right, pos - rightStart)); + } + } + + Collections.sort(pileupElements); + return new ReadBackedPileupImpl(loc, pileupElements); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index c7ffcab0c..ff7d12f09 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.utils.NGSPlatform; /** * @author ebanks @@ -15,16 +16,28 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { // the SAMReadGroupRecord data we're caching private String mSample = null; private String mPlatform = null; + private NGSPlatform mNGSPlatform = null; // because some values can be null, we don't want to duplicate effort private boolean retrievedSample = false; private boolean retrievedPlatform = false; + private boolean retrievedNGSPlatform = false; + public GATKSAMReadGroupRecord(final String id) { + super(id); + } public GATKSAMReadGroupRecord(SAMReadGroupRecord record) { super(record.getReadGroupId(), record); } + public GATKSAMReadGroupRecord(SAMReadGroupRecord record, NGSPlatform pl) { + super(record.getReadGroupId(), record); + setPlatform(pl.getDefaultPlatform()); + mNGSPlatform = pl; + retrievedPlatform = retrievedNGSPlatform = true; + } + /////////////////////////////////////////////////////////////////////////////// // *** The following methods are overloaded to cache the appropriate data ***// /////////////////////////////////////////////////////////////////////////////// @@ -55,5 +68,15 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { super.setPlatform(s); mPlatform = s; retrievedPlatform = true; + retrievedNGSPlatform = false; // recalculate the NGSPlatform + } + + public NGSPlatform getNGSPlatform() { + if ( ! retrievedNGSPlatform ) { + mNGSPlatform = NGSPlatform.fromReadGroupPL(getPlatform()); + retrievedNGSPlatform = true; + } + + return mNGSPlatform; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index e7c235cf7..d6c0b68b8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -1,49 +1,56 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.NGSPlatform; -import java.lang.reflect.Method; import java.util.HashMap; -import java.util.List; import java.util.Map; /** - * @author ebanks + * @author ebanks, depristo * GATKSAMRecord * - * this class extends the samtools SAMRecord class and caches important + * this class extends the samtools BAMRecord class (and SAMRecord) and caches important * (and oft-accessed) data that's not already cached by the SAMRecord class * * IMPORTANT NOTE: Because ReadGroups are not set through the SAMRecord, * if they are ever modified externally then one must also invoke the * setReadGroup() method here to ensure that the cache is kept up-to-date. * - * 13 Oct 2010 - mhanna - this class is fundamentally flawed: it uses a decorator - * pattern to wrap a heavyweight object, which can lead - * to heinous side effects if the wrapping is not carefully - * done. Hopefully SAMRecord will become an interface and - * this will eventually be fixed. */ -public class GATKSAMRecord extends SAMRecord { - - // the underlying SAMRecord which we are wrapping - private final SAMRecord mRecord; - +public class GATKSAMRecord extends BAMRecord { // the SAMRecord data we're caching private String mReadString = null; - private SAMReadGroupRecord mReadGroup = null; - private boolean mNegativeStrandFlag; - private boolean mUnmappedFlag; - private Boolean mSecondOfPairFlag = null; + private GATKSAMReadGroupRecord mReadGroup = null; + private byte[] reducedReadCounts = null; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; - - /** A private cache for the reduced read quality. Null indicates the value hasn't be fetched yet or isn't available */ - private boolean lookedUpReducedReadQuality = false; - private Integer reducedReadQuality; + private boolean retrievedReduceReadCounts = false; // These temporary attributes were added here to make life easier for // certain algorithms by providing a way to label or attach arbitrary data to @@ -51,101 +58,112 @@ public class GATKSAMRecord extends SAMRecord { // These attributes exist in memory only, and are never written to disk. private Map temporaryAttributes; - public GATKSAMRecord(SAMRecord record, boolean useOriginalBaseQualities, byte defaultBaseQualities) { - super(null); // it doesn't matter - this isn't used - if ( record == null ) - throw new IllegalArgumentException("The SAMRecord argument cannot be null"); - mRecord = record; + /** + * HACK TO CREATE GATKSAMRECORD WITH ONLY A HEADER FOR TESTING PURPOSES ONLY + * @param header + */ + public GATKSAMRecord(final SAMFileHeader header) { + this(new SAMRecord(header)); + } - mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag(); - mUnmappedFlag = mRecord.getReadUnmappedFlag(); + /** + * HACK TO CREATE GATKSAMRECORD BASED ONLY A SAMRECORD FOR TESTING PURPOSES ONLY + * @param read + */ + public GATKSAMRecord(final SAMRecord read) { + super(read.getHeader(), read.getMateReferenceIndex(), + read.getAlignmentStart(), + read.getReadName() != null ? (short)read.getReadNameLength() : 0, + (short)read.getMappingQuality(), + 0, + read.getCigarLength(), + read.getFlags(), + read.getReadLength(), + read.getMateReferenceIndex(), + read.getMateAlignmentStart(), + read.getInferredInsertSize(), + new byte[]{}); + super.clearAttributes(); + } - // because attribute methods are declared to be final (and we can't overload them), - // we need to actually set all of the attributes here - List attributes = record.getAttributes(); - for ( SAMTagAndValue attribute : attributes ) - setAttribute(attribute.tag, attribute.value); - - // if we are using default quals, check if we need them, and add if necessary. - // 1. we need if reads are lacking or have incomplete quality scores - // 2. we add if defaultBaseQualities has a positive value - if (defaultBaseQualities >= 0) { - byte reads [] = record.getReadBases(); - byte quals [] = record.getBaseQualities(); - if (quals == null || quals.length < reads.length) { - byte new_quals [] = new byte [reads.length]; - for (int i=0; i getAttributes() { return mRecord.getAttributes(); } - - public SAMFileHeader getHeader() { return mRecord.getHeader(); } - - public void setHeader(SAMFileHeader samFileHeader) { mRecord.setHeader(samFileHeader); } - - public byte[] getVariableBinaryRepresentation() { return mRecord.getVariableBinaryRepresentation(); } - - public int getAttributesBinarySize() { return mRecord.getAttributesBinarySize(); } - - public String format() { return mRecord.format(); } - - public List getAlignmentBlocks() { return mRecord.getAlignmentBlocks(); } - - public List validateCigar(long l) { return mRecord.validateCigar(l); } - @Override public boolean equals(Object o) { if (this == o) return true; - // note -- this forbids a GATKSAMRecord being equal to its underlying SAMRecord if (!(o instanceof GATKSAMRecord)) return false; // note that we do not consider the GATKSAMRecord internal state at all - return mRecord.equals(((GATKSAMRecord)o).mRecord); - } - - public int hashCode() { return mRecord.hashCode(); } - - public List isValid() { return mRecord.isValid(); } - - public Object clone() throws CloneNotSupportedException { return mRecord.clone(); } - - public String toString() { return mRecord.toString(); } - - public SAMFileSource getFileSource() { return mRecord.getFileSource(); } - - /** - * Sets a marker providing the source reader for this file and the position in the file from which the read originated. - * @param fileSource source of the given file. - */ - @Override - protected void setFileSource(final SAMFileSource fileSource) { - try { - Method method = SAMRecord.class.getDeclaredMethod("setFileSource",SAMFileSource.class); - method.setAccessible(true); - method.invoke(mRecord,fileSource); - } - catch(Exception ex) { - throw new ReviewedStingException("Unable to invoke setFileSource method",ex); - } + return super.equals(o); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java new file mode 100644 index 000000000..d96c874ea --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMRecordFactory; +import net.sf.samtools.BAMRecord; +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Factory interface implementation used to create GATKSamRecords + * from SAMFileReaders with SAM-JDK + * + * @author Mark DePristo + */ +public class GATKSamRecordFactory implements SAMRecordFactory { + + /** Create a new SAMRecord to be filled in */ + public SAMRecord createSAMRecord(SAMFileHeader header) { + throw new UserException.BadInput("The GATK now longer supports input SAM files"); + } + + /** Create a new BAM Record. */ + public BAMRecord createBAMRecord(final SAMFileHeader header, + final int referenceSequenceIndex, + final int alignmentStart, + final short readNameLength, + final short mappingQuality, + final int indexingBin, + final int cigarLen, + final int flags, + final int readLen, + final int mateReferenceSequenceIndex, + final int mateAlignmentStart, + final int insertSize, + final byte[] variableLengthBlock) { + return new GATKSAMRecord(header, + referenceSequenceIndex, + alignmentStart, + readNameLength, + mappingQuality, + indexingBin, + cigarLen, + flags, + readLen, + mateReferenceSequenceIndex, + mateAlignmentStart, + insertSize, + variableLengthBlock); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index d1e9a236f..f8e4927ed 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -52,38 +52,6 @@ public class ReadUtils { // ---------------------------------------------------------------------------------------------------- public static final String REDUCED_READ_QUALITY_TAG = "RQ"; - public static final String REDUCED_READ_CONSENSUS_COUNTS_TAG = "CC"; - - public final static byte[] getReducedReadQualityTagValue(final SAMRecord read) { - return read.getByteArrayAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); - } - - public final static boolean isReducedRead(final SAMRecord read) { - return getReducedReadQualityTagValue(read) != null; - } - - public final static byte getReducedQual(final SAMRecord read, final int i) { - return read.getBaseQualities()[i]; - } - - public final static byte getReducedCount(final SAMRecord read, final int i) { - return getReducedReadQualityTagValue(read)[i]; - } - - public final static SAMRecord reducedReadWithReducedQuals(final SAMRecord read) { - if ( ! isReducedRead(read) ) throw new IllegalArgumentException("read must be a reduced read"); - return read; -// try { -// SAMRecord newRead = (SAMRecord)read.clone(); -// byte reducedQual = (byte)(int)getReducedReadQualityTagValue(read); -// byte[] newQuals = new byte[read.getBaseQualities().length]; -// Arrays.fill(newQuals, reducedQual); -// newRead.setBaseQualities(newQuals); -// return newRead; -// } catch ( CloneNotSupportedException e ) { -// throw new ReviewedStingException("SAMRecord no longer supports clone", e); -// } - } // ---------------------------------------------------------------------------------------------------- // diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 8e11add33..f99a105ae 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -50,6 +50,7 @@ public abstract class BaseTest { public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; + //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; @@ -99,10 +100,10 @@ public abstract class BaseTest { logger.setLevel(Level.WARN); // find our file sources - if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { - logger.fatal("We can't locate the reference directories. Aborting!"); - throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); - } +// if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { +// logger.fatal("We can't locate the reference directories. Aborting!"); +// throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); +// } } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java index 702ba9f4f..c7eb4d88b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java @@ -18,7 +18,7 @@ public class BAQIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testPrintReadsNoBAQ() { - WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("902197bf77ed5a828d50e08771685928")); + WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("d97340a2bba2c6320d1ebeb86024a27c")); executeTest(String.format("testPrintReadsNoBAQ"), spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 6b6346447..7d6cfc7ad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -224,7 +224,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41")); + Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -252,7 +252,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae")); + Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -262,7 +262,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("408d3aba4d094c067fc00a43992c2292")); + Arrays.asList("118918f2e9e56a3cfc5ccb2856d529c8")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -272,7 +272,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("5e4e09354410b76fc0d822050d84132a")); + Arrays.asList("a20799237accd52c1b8c2ac096309c8f")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -282,7 +282,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("c599eedbeb422713b8a28529e805e4ae")); + Arrays.asList("18ef8181157b4ac3eb8492f538467f92")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -291,7 +291,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("37d908a682ac269f8f19dec939ff5b01")); + Arrays.asList("ad884e511a751b05e64db5314314365a")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index cc0007439..59a6ecb8d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -5,6 +5,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeTest; @@ -12,7 +13,7 @@ import org.testng.annotations.Test; public class ReadUtilsUnitTest extends BaseTest { - SAMRecord read, reducedRead; + GATKSAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; @@ -47,13 +48,12 @@ public class ReadUtilsUnitTest extends BaseTest { @Test public void testReducedReads() { - Assert.assertFalse(ReadUtils.isReducedRead(read), "isReducedRead is false for normal read"); - Assert.assertEquals(ReadUtils.getReducedReadQualityTagValue(read), null, "No reduced read tag in normal read"); + Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); + Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); - Assert.assertTrue(ReadUtils.isReducedRead(reducedRead), "isReducedRead is true for reduced read"); + Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); for ( int i = 0; i < reducedRead.getReadLength(); i++) { - Assert.assertEquals(ReadUtils.getReducedQual(reducedRead, i), read.getBaseQualities()[i], "Reduced read quality not set to the expected value at " + i); - Assert.assertEquals(ReadUtils.getReducedCount(reducedRead, i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); + Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java index 76dd5d341..0f19e2f90 100644 --- a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; @@ -28,7 +29,7 @@ public class ReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeOne() { - List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); ReservoirDownsampler downsampler = new ReservoirDownsampler(1); downsampler.addAll(reads); @@ -40,7 +41,7 @@ public class ReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeGreaterThanOne() { - List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); ReservoirDownsampler downsampler = new ReservoirDownsampler(5); downsampler.addAll(reads); diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java new file mode 100644 index 000000000..8b797def4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import com.google.caliper.runner.CaliperMain; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; + +import java.util.*; + +/** + * Caliper microbenchmark of fragment pileup + */ +public class FragmentPileupBenchmark extends SimpleBenchmark { + List pileups; + + @Param({"0", "4", "30", "150", "1000"}) + int pileupSize; // set automatically by framework + + @Param({"200", "400"}) + int insertSize; // set automatically by framework + + @Override protected void setUp() { + final int nPileupsToGenerate = 100; + pileups = new ArrayList(nPileupsToGenerate); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + GenomeLocParser genomeLocParser; + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 50); + final int readLen = 100; + + for ( int pileupN = 0; pileupN < nPileupsToGenerate; pileupN++ ) { + ReadBackedPileup rbp = ArtificialSAMUtils.createReadBackedPileup(header, loc, readLen, insertSize, pileupSize); + pileups.add(rbp); + } + } + + private void run(int rep, FragmentPileup.FragmentMatchingAlgorithm algorithm) { + int nFrags = 0; + for ( int i = 0; i < rep; i++ ) { + for ( ReadBackedPileup rbp : pileups ) + nFrags += new FragmentPileup(rbp, algorithm).getTwoReadPileup().size(); + } + } + + public void timeOriginal(int rep) { + run(rep, FragmentPileup.FragmentMatchingAlgorithm.ORIGINAL); + } + + public void timeSkipNonOverlapping(int rep) { + run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlapping); + } + + public static void main(String[] args) { + CaliperMain.main(FragmentPileupBenchmark.class, args); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java new file mode 100644 index 000000000..c42c01c65 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Test routines for read-backed pileup. + */ +public class FragmentPileupUnitTest extends BaseTest { + private static SAMFileHeader header; + + private class FragmentPileupTest extends TestDataProvider { + List states = new ArrayList(); + + private FragmentPileupTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { + super(FragmentPileupTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative)); + + List pair = ArtificialSAMUtils.createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + SAMRecord left = pair.get(0); + SAMRecord right = pair.get(1); + + for ( int pos = leftStart; pos < rightStart + readLen; pos++) { + boolean posCoveredByLeft = pos >= left.getAlignmentStart() && pos <= left.getAlignmentEnd(); + boolean posCoveredByRight = pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd(); + + if ( posCoveredByLeft || posCoveredByRight ) { + List reads = new ArrayList(); + List offsets = new ArrayList(); + + if ( posCoveredByLeft ) { + reads.add(left); + offsets.add(pos - left.getAlignmentStart()); + } + + if ( posCoveredByRight ) { + reads.add(right); + offsets.add(pos - right.getAlignmentStart()); + } + + boolean shouldBeFragment = posCoveredByLeft && posCoveredByRight; + ReadBackedPileup pileup = new ReadBackedPileupImpl(null, reads, offsets); + TestState testState = new TestState(shouldBeFragment, pileup); + states.add(testState); + } + } + } + } + + private class TestState { + boolean shouldBeFragment; + ReadBackedPileup pileup; + + private TestState(final boolean shouldBeFragment, final ReadBackedPileup pileup) { + this.shouldBeFragment = shouldBeFragment; + this.pileup = pileup; + } + } + + @DataProvider(name = "fragmentPileupTest") + public Object[][] createTests() { + for ( boolean leftIsFirst : Arrays.asList(true, false) ) { + for ( boolean leftIsNegative : Arrays.asList(true, false) ) { + // Overlapping pair + // ----> [first] + // <--- [second] + new FragmentPileupTest("overlapping-pair", 10, 1, 5, leftIsFirst, leftIsNegative); + + // Non-overlapping pair + // ----> + // <---- + new FragmentPileupTest("nonoverlapping-pair", 10, 1, 15, leftIsFirst, leftIsNegative); + } + } + + return FragmentPileupTest.getTests(FragmentPileupTest.class); + } + + @Test(enabled = true, dataProvider = "fragmentPileupTest") + public void testMe(FragmentPileupTest test) { + for ( TestState testState : test.states ) { + ReadBackedPileup rbp = testState.pileup; + FragmentPileup fp = new FragmentPileup(rbp); + Assert.assertEquals(fp.getTwoReadPileup().size(), testState.shouldBeFragment ? 1 : 0); + Assert.assertEquals(fp.getOneReadPileup().size(), testState.shouldBeFragment ? 0 : 1); + } + } + + @BeforeTest + public void setup() { + header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); + } +}