From ec475a46b16f55092040bc99eca4badcbc69b02c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 29 Mar 2013 10:10:13 -0400 Subject: [PATCH] Fixing @PG tag uniqueness issue The Problem: ------------ the SAM spec does not allow multiple @PG tags with the same id. Our @PG tag writing routines were allowing that to happen with the boolean parameter "keep_all_pg_records". How this fixes it: ------------------ This commit removes that option from all the utility functions and cleans up the code around the classes that used these methods off-spec. Summarized changes: ------------------- * Remove keep_all_pg_records option from setupWriter utility methos in Util * Update all walkers to now replace the last @PG tag of the same walker (if it already exists) * Cleanup NWaySamFileWriter now that it doesn't need to keep track of the keep_all_pg_records variable * Simplify the multiple implementations to setupWriter Bamboo: ------- http://gsabamboo.broadinstitute.org/browse/GSAUNSTABLE-PARALLEL31 Issue Tracker: -------------- [fixes 47100885] --- .../compression/reducereads/ReduceReads.java | 39 ++++--- .../gatk/walkers/readutils/PrintReads.java | 6 +- .../org/broadinstitute/sting/utils/Utils.java | 100 ++++-------------- .../sting/utils/sam/NWaySAMFileWriter.java | 2 +- 4 files changed, 42 insertions(+), 105 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 5e9429284..c9730e95a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -123,7 +123,7 @@ public class ReduceReads extends ReadWalker, Redu * The number of bases to keep around mismatches (potential variation) */ @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false) - private int contextSize = 10; + public int contextSize = 10; /** * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have @@ -131,7 +131,7 @@ public class ReduceReads extends ReadWalker, Redu * towards variable regions. */ @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false) - private int minMappingQuality = 20; + public int minMappingQuality = 20; /** * The minimum base quality to be considered for the consensus synthetic read. Reads that have @@ -139,14 +139,14 @@ public class ReduceReads extends ReadWalker, Redu * towards variable regions. */ @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) - private byte minBaseQual = 20; + public byte minBaseQual = 20; /** * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality * lower than this threshold will be hard clipped off before entering the reduce reads algorithm. */ @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) - private byte minTailQuality = 2; + public byte minTailQuality = 2; /** * Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction. @@ -161,21 +161,21 @@ public class ReduceReads extends ReadWalker, Redu * and read group). */ @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false) - private boolean DONT_SIMPLIFY_READS = false; + public boolean DONT_SIMPLIFY_READS = false; /** * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired. * The program will behave correctly in those cases. */ @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false) - private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; + public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; /** * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail * quality. */ @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false) - private boolean DONT_CLIP_LOW_QUAL_TAILS = false; + public boolean DONT_CLIP_LOW_QUAL_TAILS = false; /** * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped @@ -183,7 +183,7 @@ public class ReduceReads extends ReadWalker, Redu * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) */ @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false) - private boolean DONT_USE_SOFTCLIPPED_BASES = false; + public boolean DONT_USE_SOFTCLIPPED_BASES = false; /** * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee @@ -191,55 +191,55 @@ public class ReduceReads extends ReadWalker, Redu * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. */ @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false) - private boolean DONT_COMPRESS_READ_NAMES = false; + public boolean DONT_COMPRESS_READ_NAMES = false; /** * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval * border. */ @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false) - private boolean HARD_CLIP_TO_INTERVAL = false; + public boolean HARD_CLIP_TO_INTERVAL = false; /** * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be * considered consensus. */ @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) - private double minAltProportionToTriggerVariant = 0.05; + public double minAltProportionToTriggerVariant = 0.05; /** * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be * considered consensus. */ @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false) - private double minIndelProportionToTriggerVariant = 0.05; + public double minIndelProportionToTriggerVariant = 0.05; /** * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this). * A value of 0 turns downsampling off. */ @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) - private int downsampleCoverage = 250; + public int downsampleCoverage = 250; @Hidden @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false) - private boolean nwayout = false; + public boolean nwayout = false; @Hidden @Argument(fullName = "", shortName = "dl", doc = "", required = false) - private int debugLevel = 0; + public int debugLevel = 0; @Hidden @Argument(fullName = "", shortName = "dr", doc = "", required = false) - private String debugRead = ""; + public String debugRead = ""; @Hidden @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false) - private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; + public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; @Hidden @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) - private boolean NO_PG_TAG = false; + public boolean NO_PG_TAG = false; public enum DownsampleStrategy { Normal, @@ -282,7 +282,6 @@ public class ReduceReads extends ReadWalker, Redu final boolean preSorted = true; final boolean indexOnTheFly = true; - final boolean keep_records = true; final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; if (nwayout) { SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME); @@ -292,7 +291,7 @@ public class ReduceReads extends ReadWalker, Redu writerToUse = out; out.setPresorted(false); if (!NO_PG_TAG) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java index 16afc18fa..add567b36 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; @@ -151,7 +150,7 @@ public class PrintReads extends ReadWalker impleme @Hidden @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) - private boolean NO_PG_TAG = false; + public boolean NO_PG_TAG = false; List readTransformers = Collections.emptyList(); private TreeSet samplesToChoose = new TreeSet(); @@ -166,7 +165,6 @@ public class PrintReads extends ReadWalker impleme * The initialize function. */ public void initialize() { - final boolean keep_records = true; final GenomeAnalysisEngine toolkit = getToolkit(); if ( platform != null ) @@ -192,7 +190,7 @@ public class PrintReads extends ReadWalker impleme final boolean preSorted = true; if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index e50025ea1..ff0ea958c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -29,7 +29,6 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.util.StringUtil; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; @@ -87,9 +86,7 @@ public class Utils { * @return True if the two objects are equal, false otherwise. */ public static boolean equals(Object lhs, Object rhs) { - if (lhs == null && rhs == null) return true; - else if (lhs == null) return false; - else return lhs.equals(rhs); + return lhs == null && rhs == null || lhs != null && lhs.equals(rhs); } public static List cons(final T elt, final List l) { @@ -128,35 +125,6 @@ public class Utils { logger.warn(String.format("* %s", builder)); } - public static ArrayList subseq(char[] fullArray) { - byte[] fullByteArray = new byte[fullArray.length]; - StringUtil.charsToBytes(fullArray, 0, fullArray.length, fullByteArray, 0); - return subseq(fullByteArray); - } - - public static ArrayList subseq(byte[] fullArray) { - return subseq(fullArray, 0, fullArray.length - 1); - } - - public static ArrayList subseq(byte[] fullArray, int start, int end) { - assert end < fullArray.length; - ArrayList dest = new ArrayList(end - start + 1); - for (int i = start; i <= end; i++) { - dest.add(fullArray[i]); - } - return dest; - } - - public static String baseList2string(List bases) { - byte[] basesAsbytes = new byte[bases.size()]; - int i = 0; - for (Byte b : bases) { - basesAsbytes[i] = b; - i++; - } - return new String(basesAsbytes); - } - /** * join the key value pairs of a map into one string, i.e. myMap = [A->1,B->2,C->3] with a call of: * joinMap("-","*",myMap) -> returns A-1*B-2*C-3 @@ -255,7 +223,6 @@ public class Utils { * Create a new list that contains the elements of left along with elements elts * @param left a non-null list of elements * @param elts a varargs vector for elts to append in order to left - * @param * @return A newly allocated linked list containing left followed by elts */ public static List append(final List left, T ... elts) { @@ -267,9 +234,9 @@ public class Utils { /** * Returns a string of the values in joined by separator, such as A,B,C * - * @param separator - * @param doubles - * @return + * @param separator separator character + * @param doubles the array with values + * @return a string with the values separated by the separator */ public static String join(String separator, double[] doubles) { if ( doubles == null || doubles.length == 0) @@ -486,7 +453,7 @@ public class Utils { return rcbases; } - static public final List reverse(final List l) { + static public List reverse(final List l) { final List newL = new ArrayList(l); Collections.reverse(newL); return newL; @@ -525,10 +492,8 @@ public class Utils { /** * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails, * unresolvable gets returned instead. - * - * @return */ - public static final String resolveHostname() { + public static String resolveHostname() { try { return InetAddress.getLocalHost().getCanonicalHostName(); } @@ -555,17 +520,15 @@ public class Utils { * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets * up the writer with the header and presorted status. * - * @param toolkit the engine * @param originalHeader original header - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param programRecord the program record for this program */ - public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) { - SAMFileHeader header = originalHeader.clone(); - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); + public static SAMFileHeader setupWriter(final SAMFileHeader originalHeader, final SAMProgramRecord programRecord) { + final SAMFileHeader header = originalHeader.clone(); + final List oldRecords = header.getProgramRecords(); + final List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId()))) newRecords.add(record); if (programRecord != null) { @@ -580,14 +543,13 @@ public class Utils { * the new header to be added to the BAM writer. * * @param toolkit the engine - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param walker the walker object (so we can extract the command line) * @param PROGRAM_RECORD_NAME the name for the PG tag * @return a pre-filled header for the bam writer */ - public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + public static SAMFileHeader setupWriter(final GenomeAnalysisEngine toolkit, final SAMFileHeader originalHeader, final Object walker, final String PROGRAM_RECORD_NAME) { final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); - return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord); + return setupWriter(originalHeader, programRecord); } /** @@ -597,12 +559,11 @@ public class Utils { * @param writer BAM file writer * @param toolkit the engine * @param preSorted whether or not the writer can assume reads are going to be added are already sorted - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param walker the walker object (so we can extract the command line) * @param PROGRAM_RECORD_NAME the name for the PG tag */ - public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { - SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME); + public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, walker, PROGRAM_RECORD_NAME); writer.writeHeader(header); writer.setPresorted(preSorted); } @@ -629,23 +590,11 @@ public class Utils { return programRecord; } - public static Collection makeCollection(Iterable iter) { - Collection list = new ArrayList(); - for (E item : iter) { - list.add(item); - } - return list; - } - /** * Returns the number of combinations represented by this collection * of collection of options. * * For example, if this is [[A, B], [C, D], [E, F, G]] returns 2 * 2 * 3 = 12 - * - * @param options - * @param - * @return */ @Requires("options != null") public static int nCombinations(final Collection[] options) { @@ -676,21 +625,18 @@ public class Utils { * if N = 1 => [[A], [B], [C]] * if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]] * - * @param objects - * @param n - * @param + * @param objects list of objects + * @param n size of each combination * @param withReplacement if false, the resulting permutations will only contain unique objects from objects - * @return + * @return a list with all combinations with size n of objects. */ public static List> makePermutations(final List objects, final int n, final boolean withReplacement) { final List> combinations = new ArrayList>(); - if ( n <= 0 ) - ; - else if ( n == 1 ) { + if ( n == 1 ) { for ( final T o : objects ) combinations.add(Collections.singletonList(o)); - } else { + } else if (n > 1) { final List> sub = makePermutations(objects, n - 1, withReplacement); for ( List subI : sub ) { for ( final T a : objects ) { @@ -738,9 +684,6 @@ public class Utils { /** * Create a constant map that maps each value in values to itself - * @param values - * @param - * @return */ public static Map makeIdentityFunctionMap(Collection values) { Map map = new HashMap(values.size()); @@ -756,9 +699,6 @@ public class Utils { * groupSize = 2 * result = [[A, B], [C, D], [E]] * - * @param list - * @param groupSize - * @return */ public static List> groupList(final List list, final int groupSize) { if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1"); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index d26a1f807..4cd361ba1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -141,7 +141,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { File f = new File(outName); - SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord); + SAMFileHeader header = Utils.setupWriter(toolkit.getSAMFileHeader(id), programRecord); SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(indexOnTheFly); factory.setCreateMd5File(generateMD5);