From 0a7137616c6cddaa39cac62f71fc276dcdef36f7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Mar 2012 09:11:59 -0500 Subject: [PATCH 1/7] Now converts gatkreports to properly typed R data types in gsa.read.gatkreport -- use the general function type.convert from read.table to automagically convert the string data to booleans, factors, and numeric types as appropriate. Vastly better than the previous behavior which only worked for numerics, in some cases. --- .../sting/utils/R/gsalib/R/gsa.read.gatkreport.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R index 46bbf7eda..876cf5cbc 100644 --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R @@ -4,11 +4,9 @@ colnames(d) = tableHeader; for (i in 1:ncol(d)) { - v = suppressWarnings(as.numeric(d[,i])); - - if (length(na.omit(as.numeric(v))) == length(d[,i])) { - d[,i] = v; - } + # use the general type.convert infrastructure of read.table to convert column data to R types + v = type.convert(d[,i]) + d[,i] = v; } usedNames = ls(envir=tableEnv, pattern=tableName); From 1e07e97b589eb73cebe5052fb656b3c54295b618 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 2 Mar 2012 13:30:17 -0500 Subject: [PATCH 2/7] Optimization: create allele list just once, not for each genotype --- .../sting/gatk/walkers/annotator/RankSumTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 3f555f780..00968943d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -43,15 +43,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar final ArrayList altQuals = new ArrayList(); if ( vc.isSNP() ) { + final List altAlleles = new ArrayList(); + for ( final Allele a : vc.getAlternateAlleles() ) + altAlleles.add(a.getBases()[0]); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; - final List altAlleles = new ArrayList(); - for ( final Allele a : vc.getAlternateAlleles() ) - altAlleles.add(a.getBases()[0]); - fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals); } } else if ( vc.isIndel() || vc.isMixed() ) { From ba71b0aee4267be72ed32eaaa0b89cc6434f1a1a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Mar 2012 14:51:23 -0500 Subject: [PATCH 4/7] ReadGroupProperties mk3 -- Includes sequencing date --- .../diagnostics/ReadGroupProperties.java | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java index c192d04e7..2eb3b5e85 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -36,13 +36,14 @@ import org.broadinstitute.sting.utils.Median; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; +import java.text.DateFormat; import java.util.HashMap; import java.util.Map; /** - * Emits a GATKReport containing read group, sample, library, platform, center, paired end status, - * simple read type name (e.g. 2x76) median insert size and median read length for each read group - * in every provided BAM file + * Emits a GATKReport containing read group, sample, library, platform, center, sequencing data, + * paired end status, simple read type name (e.g. 2x76) median insert size and median read length + * for each read group in every provided BAM file * * Note that this walker stops when all read groups have been observed at least a few thousand times so that * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate @@ -61,23 +62,23 @@ import java.util.Map; * *
  *      ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
- *      readgroup  sample   library       platform  center  has.any.reads  is.paired.end  n.reads.analyzed  simple.read.type  median.read.length  median.insert.size
- *      20FUK.1    NA12878  Solexa-18483  illumina  BI      true           true                      10100  2x101                            101                 387
- *      20FUK.2    NA12878  Solexa-18484  illumina  BI      true           true                      10115  2x101                            101                 415
- *      20FUK.3    NA12878  Solexa-18483  illumina  BI      true           true                      10090  2x101                            101                 388
- *      20FUK.4    NA12878  Solexa-18484  illumina  BI      true           true                      10081  2x101                            101                 415
- *      20FUK.5    NA12878  Solexa-18483  illumina  BI      true           true                      10078  2x101                            101                 387
- *      20FUK.6    NA12878  Solexa-18484  illumina  BI      true           true                      10072  2x101                            101                 415
- *      20FUK.7    NA12878  Solexa-18483  illumina  BI      true           true                      10086  2x101                            101                 388
- *      20FUK.8    NA12878  Solexa-18484  illumina  BI      true           true                      10097  2x101                            101                 415
- *      20GAV.1    NA12878  Solexa-18483  illumina  BI      true           true                      10135  2x101                            101                 388
- *      20GAV.2    NA12878  Solexa-18484  illumina  BI      true           true                      10172  2x101                            101                 415
- *      20GAV.3    NA12878  Solexa-18483  illumina  BI      true           true                      10141  2x101                            101                 388
- *      20GAV.4    NA12878  Solexa-18484  illumina  BI      true           true                      10251  2x101                            101                 416
- *      20GAV.5    NA12878  Solexa-18483  illumina  BI      true           true                      10145  2x101                            101                 388
- *      20GAV.6    NA12878  Solexa-18484  illumina  BI      true           true                      10184  2x101                            101                 415
- *      20GAV.7    NA12878  Solexa-18483  illumina  BI      true           true                      10174  2x101                            101                 387
- *      20GAV.8    NA12878  Solexa-18484  illumina  BI      true           true                      10141  2x101                            101                 414
+ *      readgroup  sample   library       platform  center  date     has.any.reads  is.paired.end  n.reads.analyzed  simple.read.type  median.read.length  median.insert.size
+ *      20FUK.1    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        498  2x101                            101                 386
+ *      20FUK.2    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        476  2x101                            101                 417
+ *      20FUK.3    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        407  2x101                            101                 387
+ *      20FUK.4    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        389  2x101                            101                 415
+ *      20FUK.5    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        433  2x101                            101                 386
+ *      20FUK.6    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        480  2x101                            101                 418
+ *      20FUK.7    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        450  2x101                            101                 386
+ *      20FUK.8    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        438  2x101                            101                 418
+ *      20GAV.1    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        490  2x101                            101                 391
+ *      20GAV.2    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        485  2x101                            101                 417
+ *      20GAV.3    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        460  2x101                            101                 392
+ *      20GAV.4    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        434  2x101                            101                 415
+ *      20GAV.5    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        479  2x101                            101                 389
+ *      20GAV.6    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        461  2x101                            101                 416
+ *      20GAV.7    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        509  2x101                            101                 386
+ *      20GAV.8    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        476  2x101                            101                 410                           101                 414
  *      
*

* @@ -172,6 +173,7 @@ public class ReadGroupProperties extends ReadWalker { final GATKReport report = new GATKReport(); report.addTable(TABLE_NAME, "Table of read group properties"); GATKReportTable table = report.getTable(TABLE_NAME); + DateFormat dateFormatter = DateFormat.getDateInstance(DateFormat.SHORT); table.addPrimaryKey("readgroup"); //* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and @@ -180,6 +182,7 @@ public class ReadGroupProperties extends ReadWalker { table.addColumn("library", "NA"); table.addColumn("platform", "NA"); table.addColumn("center", "NA"); + table.addColumn("date", "NA"); table.addColumn("has.any.reads", "false"); table.addColumn("is.paired.end", "false"); table.addColumn("n.reads.analyzed", "NA"); @@ -200,6 +203,7 @@ public class ReadGroupProperties extends ReadWalker { table.set(rgID, "library", rg.getLibrary()); table.set(rgID, "platform", rg.getPlatform()); table.set(rgID, "center", rg.getSequencingCenter()); + table.set(rgID, "date", dateFormatter.format(rg.getRunDate())); table.set(rgID, "has.any.reads", hasAnyReads); table.set(rgID, "is.paired.end", isPaired); table.set(rgID, "n.reads.analyzed", info.nReadsSeen); From 69611af7d303246b1087c5081aaeada66fbaded5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Mar 2012 18:53:45 -0500 Subject: [PATCH 6/7] Workaround for bug in Picard in ReadGroupProperties -- NPE caused when you call getRunDate on a read group without a date. --- .../diagnostics/ReadGroupProperties.java | 31 ++++++++++++------- .../ReadGroupPropertiesIntegrationTest.java | 2 +- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java index 2eb3b5e85..d7a48d321 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -199,19 +199,28 @@ public class ReadGroupProperties extends ReadWalker { final boolean hasAnyReads = info.nReadsSeen > 0; final int readLength = info.readLength.getMedian(0); - table.set(rgID, "sample", rg.getSample()); - table.set(rgID, "library", rg.getLibrary()); - table.set(rgID, "platform", rg.getPlatform()); - table.set(rgID, "center", rg.getSequencingCenter()); - table.set(rgID, "date", dateFormatter.format(rg.getRunDate())); - table.set(rgID, "has.any.reads", hasAnyReads); - table.set(rgID, "is.paired.end", isPaired); - table.set(rgID, "n.reads.analyzed", info.nReadsSeen); - table.set(rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA"); - table.set(rgID, "median.read.length", hasAnyReads ? readLength : "NA" ); - table.set(rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" ); + setTableValue(table, rgID, "sample", rg.getSample()); + setTableValue(table, rgID, "library", rg.getLibrary()); + setTableValue(table, rgID, "platform", rg.getPlatform()); + setTableValue(table, rgID, "center", rg.getSequencingCenter()); + try { + setTableValue(table, rgID, "date", rg.getRunDate() != null ? dateFormatter.format(rg.getRunDate()) : "NA"); + } catch ( NullPointerException e ) { + // TODO: remove me when bug in Picard is fixed that causes NPE when date isn't present + setTableValue(table, rgID, "date", "NA"); + } + setTableValue(table, rgID, "has.any.reads", hasAnyReads); + setTableValue(table, rgID, "is.paired.end", isPaired); + setTableValue(table, rgID, "n.reads.analyzed", info.nReadsSeen); + setTableValue(table, rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA"); + setTableValue(table, rgID, "median.read.length", hasAnyReads ? readLength : "NA" ); + setTableValue(table, rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" ); } report.print(out); } + + private final void setTableValue(GATKReportTable table, final String rgID, final String key, final Object value) { + table.set(rgID, key, value == null ? "NA" : value); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java index 04c30a8fe..1a4c8db30 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java @@ -38,7 +38,7 @@ public class ReadGroupPropertiesIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s", 1, - Arrays.asList("8e4d09665c0b65c971bd5dead24f95fe")); + Arrays.asList("6b8cce223af28cbadcfe87a3b841fc56")); executeTest("ReadGroupProperties:", spec); } } \ No newline at end of file From 3b5a7c34d74518739ca588be62e21f591d053aa1 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 4 Mar 2012 10:24:29 -0500 Subject: [PATCH 7/7] Added argument to ValidationAmplicons to only output valid sequences - useful for not having to post-filter or grep resulting files before delivering downstream --- .../validation/ValidationAmplicons.java | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index b27bef265..e812fb53a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -110,6 +110,13 @@ public class ValidationAmplicons extends RodWalker { @Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false) boolean lowerCaseSNPs = false; + /** + * If onlyOutputValidAmplicons is true, the output fasta file will contain only valid sequences. + * Useful for producing delivery-ready files. + */ + @Argument(doc="Only output valid sequences.",fullName="onlyOutputValidAmplicons",required=false) + boolean onlyOutputValidAmplicons = false; + /** * BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased. * This changes the size of the k-mer used for alignment. @@ -486,14 +493,16 @@ public class ValidationAmplicons extends RodWalker { valid = "Valid"; } - String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); - if (!sequenomOutput) - out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); - else { - seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record - probeName = probeName.replace("amplicon_","a"); - out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + if (!onlyOutputValidAmplicons || !sequenceInvalid) { + String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); + if (!sequenomOutput) + out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); + else { + seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record + probeName = probeName.replace("amplicon_","a"); + out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + } } } }