From 0a7137616c6cddaa39cac62f71fc276dcdef36f7 Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Fri, 2 Mar 2012 09:11:59 -0500
Subject: [PATCH 1/7] Now converts gatkreports to properly typed R data types
in gsa.read.gatkreport
-- use the general function type.convert from read.table to automagically convert the string data to booleans, factors, and numeric types as appropriate. Vastly better than the previous behavior which only worked for numerics, in some cases.
---
.../sting/utils/R/gsalib/R/gsa.read.gatkreport.R | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
index 46bbf7eda..876cf5cbc 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
@@ -4,11 +4,9 @@
colnames(d) = tableHeader;
for (i in 1:ncol(d)) {
- v = suppressWarnings(as.numeric(d[,i]));
-
- if (length(na.omit(as.numeric(v))) == length(d[,i])) {
- d[,i] = v;
- }
+ # use the general type.convert infrastructure of read.table to convert column data to R types
+ v = type.convert(d[,i])
+ d[,i] = v;
}
usedNames = ls(envir=tableEnv, pattern=tableName);
From 1e07e97b589eb73cebe5052fb656b3c54295b618 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Fri, 2 Mar 2012 13:30:17 -0500
Subject: [PATCH 2/7] Optimization: create allele list just once, not for each
genotype
---
.../sting/gatk/walkers/annotator/RankSumTest.java | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
index 3f555f780..00968943d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
@@ -43,15 +43,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
final ArrayList altQuals = new ArrayList();
if ( vc.isSNP() ) {
+ final List altAlleles = new ArrayList();
+ for ( final Allele a : vc.getAlternateAlleles() )
+ altAlleles.add(a.getBases()[0]);
+
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null )
continue;
- final List altAlleles = new ArrayList();
- for ( final Allele a : vc.getAlternateAlleles() )
- altAlleles.add(a.getBases()[0]);
-
fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals);
}
} else if ( vc.isIndel() || vc.isMixed() ) {
From ba71b0aee4267be72ed32eaaa0b89cc6434f1a1a Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Fri, 2 Mar 2012 14:51:23 -0500
Subject: [PATCH 4/7] ReadGroupProperties mk3
-- Includes sequencing date
---
.../diagnostics/ReadGroupProperties.java | 44 ++++++++++---------
1 file changed, 24 insertions(+), 20 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
index c192d04e7..2eb3b5e85 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
@@ -36,13 +36,14 @@ import org.broadinstitute.sting.utils.Median;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
+import java.text.DateFormat;
import java.util.HashMap;
import java.util.Map;
/**
- * Emits a GATKReport containing read group, sample, library, platform, center, paired end status,
- * simple read type name (e.g. 2x76) median insert size and median read length for each read group
- * in every provided BAM file
+ * Emits a GATKReport containing read group, sample, library, platform, center, sequencing data,
+ * paired end status, simple read type name (e.g. 2x76) median insert size and median read length
+ * for each read group in every provided BAM file
*
* Note that this walker stops when all read groups have been observed at least a few thousand times so that
* the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate
@@ -61,23 +62,23 @@ import java.util.Map;
*
*
* ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
- * readgroup sample library platform center has.any.reads is.paired.end n.reads.analyzed simple.read.type median.read.length median.insert.size
- * 20FUK.1 NA12878 Solexa-18483 illumina BI true true 10100 2x101 101 387
- * 20FUK.2 NA12878 Solexa-18484 illumina BI true true 10115 2x101 101 415
- * 20FUK.3 NA12878 Solexa-18483 illumina BI true true 10090 2x101 101 388
- * 20FUK.4 NA12878 Solexa-18484 illumina BI true true 10081 2x101 101 415
- * 20FUK.5 NA12878 Solexa-18483 illumina BI true true 10078 2x101 101 387
- * 20FUK.6 NA12878 Solexa-18484 illumina BI true true 10072 2x101 101 415
- * 20FUK.7 NA12878 Solexa-18483 illumina BI true true 10086 2x101 101 388
- * 20FUK.8 NA12878 Solexa-18484 illumina BI true true 10097 2x101 101 415
- * 20GAV.1 NA12878 Solexa-18483 illumina BI true true 10135 2x101 101 388
- * 20GAV.2 NA12878 Solexa-18484 illumina BI true true 10172 2x101 101 415
- * 20GAV.3 NA12878 Solexa-18483 illumina BI true true 10141 2x101 101 388
- * 20GAV.4 NA12878 Solexa-18484 illumina BI true true 10251 2x101 101 416
- * 20GAV.5 NA12878 Solexa-18483 illumina BI true true 10145 2x101 101 388
- * 20GAV.6 NA12878 Solexa-18484 illumina BI true true 10184 2x101 101 415
- * 20GAV.7 NA12878 Solexa-18483 illumina BI true true 10174 2x101 101 387
- * 20GAV.8 NA12878 Solexa-18484 illumina BI true true 10141 2x101 101 414
+ * readgroup sample library platform center date has.any.reads is.paired.end n.reads.analyzed simple.read.type median.read.length median.insert.size
+ * 20FUK.1 NA12878 Solexa-18483 illumina BI 2/2/10 true true 498 2x101 101 386
+ * 20FUK.2 NA12878 Solexa-18484 illumina BI 2/2/10 true true 476 2x101 101 417
+ * 20FUK.3 NA12878 Solexa-18483 illumina BI 2/2/10 true true 407 2x101 101 387
+ * 20FUK.4 NA12878 Solexa-18484 illumina BI 2/2/10 true true 389 2x101 101 415
+ * 20FUK.5 NA12878 Solexa-18483 illumina BI 2/2/10 true true 433 2x101 101 386
+ * 20FUK.6 NA12878 Solexa-18484 illumina BI 2/2/10 true true 480 2x101 101 418
+ * 20FUK.7 NA12878 Solexa-18483 illumina BI 2/2/10 true true 450 2x101 101 386
+ * 20FUK.8 NA12878 Solexa-18484 illumina BI 2/2/10 true true 438 2x101 101 418
+ * 20GAV.1 NA12878 Solexa-18483 illumina BI 1/26/10 true true 490 2x101 101 391
+ * 20GAV.2 NA12878 Solexa-18484 illumina BI 1/26/10 true true 485 2x101 101 417
+ * 20GAV.3 NA12878 Solexa-18483 illumina BI 1/26/10 true true 460 2x101 101 392
+ * 20GAV.4 NA12878 Solexa-18484 illumina BI 1/26/10 true true 434 2x101 101 415
+ * 20GAV.5 NA12878 Solexa-18483 illumina BI 1/26/10 true true 479 2x101 101 389
+ * 20GAV.6 NA12878 Solexa-18484 illumina BI 1/26/10 true true 461 2x101 101 416
+ * 20GAV.7 NA12878 Solexa-18483 illumina BI 1/26/10 true true 509 2x101 101 386
+ * 20GAV.8 NA12878 Solexa-18484 illumina BI 1/26/10 true true 476 2x101 101 410 101 414
*
*
*
@@ -172,6 +173,7 @@ public class ReadGroupProperties extends ReadWalker {
final GATKReport report = new GATKReport();
report.addTable(TABLE_NAME, "Table of read group properties");
GATKReportTable table = report.getTable(TABLE_NAME);
+ DateFormat dateFormatter = DateFormat.getDateInstance(DateFormat.SHORT);
table.addPrimaryKey("readgroup");
//* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
@@ -180,6 +182,7 @@ public class ReadGroupProperties extends ReadWalker {
table.addColumn("library", "NA");
table.addColumn("platform", "NA");
table.addColumn("center", "NA");
+ table.addColumn("date", "NA");
table.addColumn("has.any.reads", "false");
table.addColumn("is.paired.end", "false");
table.addColumn("n.reads.analyzed", "NA");
@@ -200,6 +203,7 @@ public class ReadGroupProperties extends ReadWalker {
table.set(rgID, "library", rg.getLibrary());
table.set(rgID, "platform", rg.getPlatform());
table.set(rgID, "center", rg.getSequencingCenter());
+ table.set(rgID, "date", dateFormatter.format(rg.getRunDate()));
table.set(rgID, "has.any.reads", hasAnyReads);
table.set(rgID, "is.paired.end", isPaired);
table.set(rgID, "n.reads.analyzed", info.nReadsSeen);
From 69611af7d303246b1087c5081aaeada66fbaded5 Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Fri, 2 Mar 2012 18:53:45 -0500
Subject: [PATCH 6/7] Workaround for bug in Picard in ReadGroupProperties
-- NPE caused when you call getRunDate on a read group without a date.
---
.../diagnostics/ReadGroupProperties.java | 31 ++++++++++++-------
.../ReadGroupPropertiesIntegrationTest.java | 2 +-
2 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
index 2eb3b5e85..d7a48d321 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
@@ -199,19 +199,28 @@ public class ReadGroupProperties extends ReadWalker {
final boolean hasAnyReads = info.nReadsSeen > 0;
final int readLength = info.readLength.getMedian(0);
- table.set(rgID, "sample", rg.getSample());
- table.set(rgID, "library", rg.getLibrary());
- table.set(rgID, "platform", rg.getPlatform());
- table.set(rgID, "center", rg.getSequencingCenter());
- table.set(rgID, "date", dateFormatter.format(rg.getRunDate()));
- table.set(rgID, "has.any.reads", hasAnyReads);
- table.set(rgID, "is.paired.end", isPaired);
- table.set(rgID, "n.reads.analyzed", info.nReadsSeen);
- table.set(rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA");
- table.set(rgID, "median.read.length", hasAnyReads ? readLength : "NA" );
- table.set(rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" );
+ setTableValue(table, rgID, "sample", rg.getSample());
+ setTableValue(table, rgID, "library", rg.getLibrary());
+ setTableValue(table, rgID, "platform", rg.getPlatform());
+ setTableValue(table, rgID, "center", rg.getSequencingCenter());
+ try {
+ setTableValue(table, rgID, "date", rg.getRunDate() != null ? dateFormatter.format(rg.getRunDate()) : "NA");
+ } catch ( NullPointerException e ) {
+ // TODO: remove me when bug in Picard is fixed that causes NPE when date isn't present
+ setTableValue(table, rgID, "date", "NA");
+ }
+ setTableValue(table, rgID, "has.any.reads", hasAnyReads);
+ setTableValue(table, rgID, "is.paired.end", isPaired);
+ setTableValue(table, rgID, "n.reads.analyzed", info.nReadsSeen);
+ setTableValue(table, rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA");
+ setTableValue(table, rgID, "median.read.length", hasAnyReads ? readLength : "NA" );
+ setTableValue(table, rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" );
}
report.print(out);
}
+
+ private final void setTableValue(GATKReportTable table, final String rgID, final String key, final Object value) {
+ table.set(rgID, key, value == null ? "NA" : value);
+ }
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java
index 04c30a8fe..1a4c8db30 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java
@@ -38,7 +38,7 @@ public class ReadGroupPropertiesIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s",
1,
- Arrays.asList("8e4d09665c0b65c971bd5dead24f95fe"));
+ Arrays.asList("6b8cce223af28cbadcfe87a3b841fc56"));
executeTest("ReadGroupProperties:", spec);
}
}
\ No newline at end of file
From 3b5a7c34d74518739ca588be62e21f591d053aa1 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel
Date: Sun, 4 Mar 2012 10:24:29 -0500
Subject: [PATCH 7/7] Added argument to ValidationAmplicons to only output
valid sequences - useful for not having to post-filter or grep resulting
files before delivering downstream
---
.../validation/ValidationAmplicons.java | 23 +++++++++++++------
1 file changed, 16 insertions(+), 7 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
index b27bef265..e812fb53a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
@@ -110,6 +110,13 @@ public class ValidationAmplicons extends RodWalker {
@Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false)
boolean lowerCaseSNPs = false;
+ /**
+ * If onlyOutputValidAmplicons is true, the output fasta file will contain only valid sequences.
+ * Useful for producing delivery-ready files.
+ */
+ @Argument(doc="Only output valid sequences.",fullName="onlyOutputValidAmplicons",required=false)
+ boolean onlyOutputValidAmplicons = false;
+
/**
* BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased.
* This changes the size of the k-mer used for alignment.
@@ -486,14 +493,16 @@ public class ValidationAmplicons extends RodWalker {
valid = "Valid";
}
- String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
- if (!sequenomOutput)
- out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
- else {
- seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record
- probeName = probeName.replace("amplicon_","a");
- out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity);
+ if (!onlyOutputValidAmplicons || !sequenceInvalid) {
+ String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
+ if (!sequenomOutput)
+ out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
+ else {
+ seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record
+ probeName = probeName.replace("amplicon_","a");
+ out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity);
+ }
}
}
}