Merge pull request #135 from broadinstitute/mc_pgtag_fix

Fixing @PG tag uniqueness issue
This commit is contained in:
Mark DePristo 2013-03-31 11:36:40 -07:00
commit 7c83efc1b9
4 changed files with 42 additions and 105 deletions

View File

@ -123,7 +123,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
* The number of bases to keep around mismatches (potential variation)
*/
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
private int contextSize = 10;
public int contextSize = 10;
/**
* The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
@ -131,7 +131,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
* towards variable regions.
*/
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
private int minMappingQuality = 20;
public int minMappingQuality = 20;
/**
* The minimum base quality to be considered for the consensus synthetic read. Reads that have
@ -139,14 +139,14 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
* towards variable regions.
*/
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
private byte minBaseQual = 20;
public byte minBaseQual = 20;
/**
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
* lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
*/
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
private byte minTailQuality = 2;
public byte minTailQuality = 2;
/**
* Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction.
@ -161,21 +161,21 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
* and read group).
*/
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
private boolean DONT_SIMPLIFY_READS = false;
public boolean DONT_SIMPLIFY_READS = false;
/**
* Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
* The program will behave correctly in those cases.
*/
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
/**
* Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
* quality.
*/
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
private boolean DONT_CLIP_LOW_QUAL_TAILS = false;
public boolean DONT_CLIP_LOW_QUAL_TAILS = false;
/**
* Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
@ -183,7 +183,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
*/
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
private boolean DONT_USE_SOFTCLIPPED_BASES = false;
public boolean DONT_USE_SOFTCLIPPED_BASES = false;
/**
* Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
@ -191,55 +191,55 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
*/
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
private boolean DONT_COMPRESS_READ_NAMES = false;
public boolean DONT_COMPRESS_READ_NAMES = false;
/**
* Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
* border.
*/
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
private boolean HARD_CLIP_TO_INTERVAL = false;
public boolean HARD_CLIP_TO_INTERVAL = false;
/**
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
* considered consensus.
*/
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
private double minAltProportionToTriggerVariant = 0.05;
public double minAltProportionToTriggerVariant = 0.05;
/**
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
* considered consensus.
*/
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
private double minIndelProportionToTriggerVariant = 0.05;
public double minIndelProportionToTriggerVariant = 0.05;
/**
* Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
* A value of 0 turns downsampling off.
*/
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
private int downsampleCoverage = 250;
public int downsampleCoverage = 250;
@Hidden
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
private boolean nwayout = false;
public boolean nwayout = false;
@Hidden
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
private int debugLevel = 0;
public int debugLevel = 0;
@Hidden
@Argument(fullName = "", shortName = "dr", doc = "", required = false)
private String debugRead = "";
public String debugRead = "";
@Hidden
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
@Hidden
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
private boolean NO_PG_TAG = false;
public boolean NO_PG_TAG = false;
public enum DownsampleStrategy {
Normal,
@ -282,7 +282,6 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
final boolean preSorted = true;
final boolean indexOnTheFly = true;
final boolean keep_records = true;
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
if (nwayout) {
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
@ -292,7 +291,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
writerToUse = out;
out.setPresorted(false);
if (!NO_PG_TAG) {
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
}
}
}

View File

@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.baq.BAQ;
@ -151,7 +150,7 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> impleme
@Hidden
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
private boolean NO_PG_TAG = false;
public boolean NO_PG_TAG = false;
List<ReadTransformer> readTransformers = Collections.emptyList();
private TreeSet<String> samplesToChoose = new TreeSet<String>();
@ -166,7 +165,6 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> impleme
* The initialize function.
*/
public void initialize() {
final boolean keep_records = true;
final GenomeAnalysisEngine toolkit = getToolkit();
if ( platform != null )
@ -192,7 +190,7 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> impleme
final boolean preSorted = true;
if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
}
}

View File

@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.util.StringUtil;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
@ -87,9 +86,7 @@ public class Utils {
* @return True if the two objects are equal, false otherwise.
*/
public static boolean equals(Object lhs, Object rhs) {
if (lhs == null && rhs == null) return true;
else if (lhs == null) return false;
else return lhs.equals(rhs);
return lhs == null && rhs == null || lhs != null && lhs.equals(rhs);
}
public static <T> List<T> cons(final T elt, final List<T> l) {
@ -128,35 +125,6 @@ public class Utils {
logger.warn(String.format("* %s", builder));
}
public static ArrayList<Byte> subseq(char[] fullArray) {
byte[] fullByteArray = new byte[fullArray.length];
StringUtil.charsToBytes(fullArray, 0, fullArray.length, fullByteArray, 0);
return subseq(fullByteArray);
}
public static ArrayList<Byte> subseq(byte[] fullArray) {
return subseq(fullArray, 0, fullArray.length - 1);
}
public static ArrayList<Byte> subseq(byte[] fullArray, int start, int end) {
assert end < fullArray.length;
ArrayList<Byte> dest = new ArrayList<Byte>(end - start + 1);
for (int i = start; i <= end; i++) {
dest.add(fullArray[i]);
}
return dest;
}
public static String baseList2string(List<Byte> bases) {
byte[] basesAsbytes = new byte[bases.size()];
int i = 0;
for (Byte b : bases) {
basesAsbytes[i] = b;
i++;
}
return new String(basesAsbytes);
}
/**
* join the key value pairs of a map into one string, i.e. myMap = [A->1,B->2,C->3] with a call of:
* joinMap("-","*",myMap) -> returns A-1*B-2*C-3
@ -255,7 +223,6 @@ public class Utils {
* Create a new list that contains the elements of left along with elements elts
* @param left a non-null list of elements
* @param elts a varargs vector for elts to append in order to left
* @param <T>
* @return A newly allocated linked list containing left followed by elts
*/
public static <T> List<T> append(final List<T> left, T ... elts) {
@ -267,9 +234,9 @@ public class Utils {
/**
* Returns a string of the values in joined by separator, such as A,B,C
*
* @param separator
* @param doubles
* @return
* @param separator separator character
* @param doubles the array with values
* @return a string with the values separated by the separator
*/
public static String join(String separator, double[] doubles) {
if ( doubles == null || doubles.length == 0)
@ -486,7 +453,7 @@ public class Utils {
return rcbases;
}
static public final <T> List<T> reverse(final List<T> l) {
static public <T> List<T> reverse(final List<T> l) {
final List<T> newL = new ArrayList<T>(l);
Collections.reverse(newL);
return newL;
@ -525,10 +492,8 @@ public class Utils {
/**
* Helper utility that calls into the InetAddress system to resolve the hostname. If this fails,
* unresolvable gets returned instead.
*
* @return
*/
public static final String resolveHostname() {
public static String resolveHostname() {
try {
return InetAddress.getLocalHost().getCanonicalHostName();
}
@ -555,17 +520,15 @@ public class Utils {
* Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets
* up the writer with the header and presorted status.
*
* @param toolkit the engine
* @param originalHeader original header
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
* @param programRecord the program record for this program
*/
public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) {
SAMFileHeader header = originalHeader.clone();
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
public static SAMFileHeader setupWriter(final SAMFileHeader originalHeader, final SAMProgramRecord programRecord) {
final SAMFileHeader header = originalHeader.clone();
final List<SAMProgramRecord> oldRecords = header.getProgramRecords();
final List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
for ( SAMProgramRecord record : oldRecords )
if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS )
if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())))
newRecords.add(record);
if (programRecord != null) {
@ -580,14 +543,13 @@ public class Utils {
* the new header to be added to the BAM writer.
*
* @param toolkit the engine
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
* @param walker the walker object (so we can extract the command line)
* @param PROGRAM_RECORD_NAME the name for the PG tag
* @return a pre-filled header for the bam writer
*/
public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
public static SAMFileHeader setupWriter(final GenomeAnalysisEngine toolkit, final SAMFileHeader originalHeader, final Object walker, final String PROGRAM_RECORD_NAME) {
final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME);
return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord);
return setupWriter(originalHeader, programRecord);
}
/**
@ -597,12 +559,11 @@ public class Utils {
* @param writer BAM file writer
* @param toolkit the engine
* @param preSorted whether or not the writer can assume reads are going to be added are already sorted
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
* @param walker the walker object (so we can extract the command line)
* @param PROGRAM_RECORD_NAME the name for the PG tag
*/
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME);
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, Object walker, String PROGRAM_RECORD_NAME) {
SAMFileHeader header = setupWriter(toolkit, originalHeader, walker, PROGRAM_RECORD_NAME);
writer.writeHeader(header);
writer.setPresorted(preSorted);
}
@ -629,23 +590,11 @@ public class Utils {
return programRecord;
}
public static <E> Collection<E> makeCollection(Iterable<E> iter) {
Collection<E> list = new ArrayList<E>();
for (E item : iter) {
list.add(item);
}
return list;
}
/**
* Returns the number of combinations represented by this collection
* of collection of options.
*
* For example, if this is [[A, B], [C, D], [E, F, G]] returns 2 * 2 * 3 = 12
*
* @param options
* @param <T>
* @return
*/
@Requires("options != null")
public static <T> int nCombinations(final Collection<T>[] options) {
@ -676,21 +625,18 @@ public class Utils {
* if N = 1 => [[A], [B], [C]]
* if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]]
*
* @param objects
* @param n
* @param <T>
* @param objects list of objects
* @param n size of each combination
* @param withReplacement if false, the resulting permutations will only contain unique objects from objects
* @return
* @return a list with all combinations with size n of objects.
*/
public static <T> List<List<T>> makePermutations(final List<T> objects, final int n, final boolean withReplacement) {
final List<List<T>> combinations = new ArrayList<List<T>>();
if ( n <= 0 )
;
else if ( n == 1 ) {
if ( n == 1 ) {
for ( final T o : objects )
combinations.add(Collections.singletonList(o));
} else {
} else if (n > 1) {
final List<List<T>> sub = makePermutations(objects, n - 1, withReplacement);
for ( List<T> subI : sub ) {
for ( final T a : objects ) {
@ -738,9 +684,6 @@ public class Utils {
/**
* Create a constant map that maps each value in values to itself
* @param values
* @param <T>
* @return
*/
public static <T> Map<T, T> makeIdentityFunctionMap(Collection<T> values) {
Map<T,T> map = new HashMap<T, T>(values.size());
@ -756,9 +699,6 @@ public class Utils {
* groupSize = 2
* result = [[A, B], [C, D], [E]]
*
* @param list
* @param groupSize
* @return
*/
public static <T> List<List<T>> groupList(final List<T> list, final int groupSize) {
if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1");

View File

@ -141,7 +141,7 @@ public class NWaySAMFileWriter implements SAMFileWriter {
private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted,
boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) {
File f = new File(outName);
SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord);
SAMFileHeader header = Utils.setupWriter(toolkit.getSAMFileHeader(id), programRecord);
SAMFileWriterFactory factory = new SAMFileWriterFactory();
factory.setCreateIndex(indexOnTheFly);
factory.setCreateMd5File(generateMD5);