From 5c9b659c02aef41527ee07cd9c2e472a23de977b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 28 Sep 2011 12:10:47 -0400 Subject: [PATCH 1/4] clipping both ends of the reads was modifying the original read This goes against the ReadClipper contract, and was affecting the second part of the read that spans over multiple intervals. Fixed. --- .../broadinstitute/sting/utils/clipreads/ReadClipper.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 83db20238..275f476dc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -93,8 +93,9 @@ public class ReadClipper { public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { if (left == right) return new SAMRecord(read.getHeader()); - this.read = hardClipByReferenceCoordinates(right, -1); - return hardClipByReferenceCoordinates(-1, left); + SAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); + ReadClipper clipper = new ReadClipper(leftTailRead); + return clipper.hardClipByReferenceCoordinatesLeftTail(left); } public SAMRecord hardClipLowQualEnds(byte lowQual) { From 3c7b7f74ef235ed9933c8227b2c89cb68b3374b0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 28 Sep 2011 14:41:30 -0400 Subject: [PATCH 2/4] Optimized interval iteration Using a TreedSet to manipulate getToolkit.getIntervals() and being smart about which intervals to test makes interval clipping O(1) instead of O(n). --- .../sting/utils/GenomeLocComparator.java | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java new file mode 100644 index 000000000..7aa9fdd65 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.Comparator; + +/** + * + * @author Mauricio Carneiro + * @since 9/28/11 + */ +public class GenomeLocComparator implements Comparator { + /** + * compares genomeLoc's contigs + * + * @param gl1 the genome loc to compare contigs + * @param gl2 the genome loc to compare contigs + * @return 0 if equal, -1 if gl2.contig is greater, 1 if gl1.contig is greater + */ + @Requires("gl2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public final int compareContigs( GenomeLoc gl1, GenomeLoc gl2 ) { + if (gl1.contigIndex == gl2.contigIndex) + return 0; + else if (gl1.contigIndex > gl2.contigIndex) + return 1; + return -1; + } + + @Requires("gl2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public int compare ( GenomeLoc gl1, GenomeLoc gl2 ) { + int result = 0; + + if ( gl1 == gl2 ) { + result = 0; + } + else if(GenomeLoc.isUnmapped(gl1)) + result = 1; + else if(GenomeLoc.isUnmapped(gl2)) + result = -1; + else { + final int cmpContig = compareContigs(gl1, gl2); + + if ( cmpContig != 0 ) { + result = cmpContig; + } else { + if ( gl1.getStart() < gl2.getStart() ) result = -1; + if ( gl1.getStart() > gl2.getStart() ) result = 1; + } + } + + return result; + } +} From ff2f4df043eae05bfc338898155a34bb50becd06 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 28 Sep 2011 16:07:00 -0400 Subject: [PATCH 3/4] Fixed hardclipping inside indel (right tail) when hard clipping the right tail of a read falls inside a deletion, clipping should fall back to the last base before the deletion to follow the ReadClipper's contract. --- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 14ebbaa6f..e0a3a5a53 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -893,6 +893,10 @@ public class ReadUtils { // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; } } From 3b73dc89fe94246eeb6f8884e1786f3e30c20678 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 28 Sep 2011 16:17:31 -0400 Subject: [PATCH 4/4] Making several esoteric arguments in the BQSR @Hidden. Adding basic support for Complete Genomics machine cycle. --- .../gatk/walkers/recalibration/CycleCovariate.java | 13 +++++++++---- .../walkers/recalibration/RecalDataManager.java | 6 ++---- .../RecalibrationArgumentCollection.java | 8 ++++++++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 945d02837..5d07922a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -159,10 +159,11 @@ public class CycleCovariate implements StandardCovariate { */ // todo -- this should be put into a common place in the code base - private static List PACBIO_NAMES = Arrays.asList("PACBIO"); private static List ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA"); private static List SOLID_NAMES = Arrays.asList("SOLID"); private static List LS454_NAMES = Arrays.asList("454"); + private static List COMPLETE_GENOMICS_NAMES = Arrays.asList("COMPLETE"); + private static List PACBIO_NAMES = Arrays.asList("PACBIO"); private static boolean isPlatform(SAMRecord read, List names) { String pl = read.getReadGroup().getPlatform().toUpperCase(); @@ -176,11 +177,10 @@ public class CycleCovariate implements StandardCovariate { public void getValues(SAMRecord read, Comparable[] comparable) { //----------------------------- - // ILLUMINA and SOLID + // Illumina, Solid, PacBio, and Complete Genomics //----------------------------- - - if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES)) { + if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES) || isPlatform(read, COMPLETE_GENOMICS_NAMES) ) { final int init; final int increment; if( !read.getReadNegativeStrandFlag() ) { @@ -222,6 +222,11 @@ public class CycleCovariate implements StandardCovariate { cycle += increment; } } + + //----------------------------- + // 454 + //----------------------------- + else if ( isPlatform(read, LS454_NAMES) ) { // Some bams have "LS454" and others have just "454" final int readLength = read.getReadLength(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index ac25d4f13..2daa8c025 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -245,8 +245,7 @@ public class RecalDataManager { readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); ((GATKSAMRecord)read).setReadGroup( readGroup ); } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() + - " Users must set both the default read group using the --default_read_group argument and the default platform using the --default_platform argument." ); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() ); } } @@ -271,8 +270,7 @@ public class RecalDataManager { } readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() + - " Users must set the default platform using the --default_platform argument." ); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index f31e2fc5b..75de84cb4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; /** * Created by IntelliJ IDEA. @@ -41,22 +42,29 @@ public class RecalibrationArgumentCollection { ////////////////////////////////// // Shared Command Line Arguments ////////////////////////////////// + @Hidden @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") public String DEFAULT_READ_GROUP = null; + @Hidden @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") public String FORCE_READ_GROUP = null; + @Hidden @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Hidden @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) public int WINDOW_SIZE = 5; /** * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. */ + @Hidden @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) public int HOMOPOLYMER_NBACK = 7; + @Hidden @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) public boolean EXCEPTION_IF_NO_TILE = false;