From 671330950d6c0249f92e37b0dd68dea58ced23e8 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 17 Aug 2011 15:46:31 -0400 Subject: [PATCH 1/7] Updated Beagle walker for gatkdocs format. Pushed unsupported, undocumented arguments to @Hidden --- .../beagle/BeagleOutputToVCFWalker.java | 37 +++++++++++---- .../beagle/ProduceBeagleInputWalker.java | 45 ++++++++++++++++--- 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 40e6748ed..aca176bc2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -48,6 +48,31 @@ import static java.lang.Math.log10; /** * Takes files produced by Beagle imputation engine and creates a vcf with modified annotations. + * + *

This walker is intended to be run after Beagle has successfully executed. The full calling sequence for using Beagle along with the GATK is:

+ * + *

1. Run ProduceBeagleInputWalker.

+ *

2. Run Beagle

+ *

3. Uncompress output files

+ *

4. Run BeagleOutputToVCFWalker.

+ * + * + * Note that this walker requires all input files produced by Beagle. + * + * + *

Example

+ *
+ *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
+ *      -R reffile.fasta -T BeagleOutputToVCF \
+ *      -B:variant,VCF input_vcf.vcf \
+ *      -B:beagleR2,BEAGLE /myrun.beagle_output.r2 \
+ *      -B:beaglePhased,BEAGLE /myrun.beagle_output.phased \
+ *      -B:beagleProbs,BEAGLE /myrun.beagle_output.gprobs \
+ *      --out output_vcf.vcf
+ *      
+ +

Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before walker is run in order to decompress them

+ */ public class BeagleOutputToVCFWalker extends RodWalker { @@ -57,22 +82,18 @@ public class BeagleOutputToVCFWalker extends RodWalker { @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public RodBinding comp; - @Input(fullName="beagleR2", shortName = "beagleR2", doc="VCF file", required=true) + @Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true) public RodBinding beagleR2; - @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="VCF file", required=true) + @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true) public RodBinding beagleProbs; - @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="VCF file", required=true) + @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true) public RodBinding beaglePhased; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="VCF File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="output_file", shortName="output", doc="Please use --out instead" ,required=false) - @Deprecated - protected String oldOutputArg; - @Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false) public boolean DONT_FILTER_MONOMORPHIC_SITES = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index c1508cf83..6ac817555 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -48,19 +48,45 @@ import java.io.PrintStream; import java.util.*; /** - * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input variant file + * Converts the input VCF into a format accepted by the Beagle imputation/analysis program. + *

+ * + *

Input

+ *

+ * A VCF with variants to convert to Beagle format + *

+ * + *

Outputs

+ *

+ * A single text file which can be fed to Beagle + *

+ *

+ * Optional: A file with a list of markers + *

+ * + *

Examples

+ *
+ *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
+ *      -R reffile.fasta -T ProduceBeagleInput \
+ *      -B:variant,VCF path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ * 
+ * */ + public class ProduceBeagleInputWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Input(fullName="validation", shortName = "validation", doc="Input VCF file", required=false) + @Hidden + @Input(fullName="validation", shortName = "validation", doc="Validation VCF file", required=false) public RodBinding validation; + @Output(doc="File to which BEAGLE input should be written",required=true) protected PrintStream beagleWriter = null; - @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) + @Hidden + @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) protected PrintStream markers = null; int markerCounter = 1; @@ -73,14 +99,19 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(doc="VQSqual key", shortName = "vqskey", required=false) protected String VQSLOD_KEY = "VQSqual"; - @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) + @Hidden + @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; - @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) + @Hidden + @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) public double validationPrior = -1.0; - @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) + @Hidden + @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) public double bootstrap = 0.0; - @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) + @Hidden + @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) VCFWriter bootstrapVCFOutput = null; + @Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false) public boolean CHECK_IS_MALE_ON_CHR_X = false; From 53006da9a55a58f1d7f7f66be3b0466848a6f5fc Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 17 Aug 2011 14:55:32 -0400 Subject: [PATCH 2/7] Improved descriptions for the SnpEff annotations in the VCF header (based on Eric's feedback). --- .../sting/gatk/walkers/annotator/SnpEff.java | 26 +++++++++---------- .../VariantAnnotatorIntegrationTest.java | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index fc5014885..350c683c2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -161,19 +161,19 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio public List getDescriptions() { return Arrays.asList( - new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID"), - new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name"), - new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID"), - new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID"), - new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank"), - new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If present, gene is non-coding"), - new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "One of the most high-impact effects across all transcripts at this site"), - new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the effect " + Arrays.toString(SnpEffConstants.EffectImpact.values())), - new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the effect"), - new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid"), - new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon"), - new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number"), - new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size") + new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"), + new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), + new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())), + new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant") ); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index f54bfa40c..832079807 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -133,7 +133,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { validationDataLocation + "1000G.exomes.vcf --snpEffFile " + validationDataLocation + "snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000", 1, - Arrays.asList("c08648a078368c80530bff004b3157f1") + Arrays.asList("03eae1dab19a9358250890594bf53607") ); executeTest("Testing SnpEff annotations", spec); } From d170187896bc58200a2e827373046b03c4c260cd Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 17 Aug 2011 16:16:05 -0400 Subject: [PATCH 3/7] Disable optimization that increases marginal speed of the GATK slightly but can produce data loss in a narrow corner case where the BGZF block(s) locations and offsets in the last index bucket of contig n overlap exactly with the BGZF block locations and offset in the last index bucket of contig n+1. A proper fix that keeps the optimization has already been introduced into unstable, but disabling the optimization is a low risk way to make sure that users of stable experience no data loss. --- .../gatk/datasources/reads/LowMemoryIntervalSharder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index ba6321121..198f7d7d3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator { */ public FilePointer next() { FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) - current = current.combine(parser,wrappedIterator.next()); + //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) + // current = current.combine(parser,wrappedIterator.next()); return current; } From d59e6ed274c555fb4e8d8198449ec0a6eb90de41 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Aug 2011 16:22:07 -0400 Subject: [PATCH 4/7] Fix for RefSeqCodec bug and better error messages -- RefSeqCodec bug: getFeatureClass() returned RefSeqCodec.class, not RefSeqFeature.class. Really should change this in Tribble to require Class to get compile time type checking -- Better error messages that actually list the available tribble types, when there's a type error --- .../commandline/ArgumentTypeDescriptor.java | 14 +- .../refdata/features/refseq/RefSeqCodec.java | 220 +++++++++--------- .../sting/utils/text/ListFileUtils.java | 4 +- 3 files changed, 119 insertions(+), 119 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index d1d4ff914..02af884a2 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -373,16 +373,16 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { if ( featureDescriptor != null ) { tribbleType = featureDescriptor.getName(); logger.warn("Dynamically determined type of " + file + " to be " + tribbleType); + } else { + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + + "Please add an explicit type tag :TYPE listing the correct type from among the supported types: %s", + manager.userFriendlyListOfAvailableFeatures())); } } } } - if ( tribbleType == null ) // error handling - throw new UserException.CommandLineException( - String.format("Could not parse argument %s with value %s", - defaultDefinition.fullName, value)); - Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); Class parameterType = getParameterizedTypeClass(type); RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags); @@ -395,8 +395,8 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { value, source.field.getName())); } catch (Exception e) { throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s.", - value, source.field.getName())); + String.format("Failed to parse value %s for argument %s. Message: %s", + value, source.field.getName(), e.getMessage())); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java index 89ee65532..cec40b5bd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java @@ -1,110 +1,110 @@ -package org.broadinstitute.sting.gatk.refdata.features.refseq; - -import org.apache.commons.io.filefilter.FalseFileFilter; -import org.broad.tribble.Feature; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.ArrayList; - -/** - * the ref seq codec - */ -public class RefSeqCodec implements ReferenceDependentFeatureCodec { - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - private boolean zero_coding_length_user_warned = false; - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - @Override - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - @Override - public Feature decodeLoc(String line) { - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); - String contig_name = fields[2]; - try { - return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - } catch ( UserException.MalformedGenomeLoc e ) { - Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); - return null; - } - } - - /** Fills this object from a text line in RefSeq (UCSC) text dump file */ - @Override - public RefSeqFeature decode(String line) { - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - - // we reference postion 15 in the split array below, make sure we have at least that many columns - if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); - String contig_name = fields[2]; - RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - - feature.setTranscript_id(fields[1]); - if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1); - else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); - else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); - - int coding_start = Integer.parseInt(fields[6])+1; - int coding_stop = Integer.parseInt(fields[7]); - - if ( coding_start > coding_stop ) { - if ( ! zero_coding_length_user_warned ) { - Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+ - "Such transcripts will be ignored (this warning is printed only once)"); - zero_coding_length_user_warned = true; - } - return null; - } - - feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop)); - feature.setGene_name(fields[12]); - String[] exon_starts = fields[9].split(","); - String[] exon_stops = fields[10].split(","); - String[] eframes = fields[15].split(","); - - if ( exon_starts.length != exon_stops.length ) - throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); - if ( exon_starts.length != eframes.length ) - throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); - - ArrayList exons = new ArrayList(exon_starts.length); - ArrayList exon_frames = new ArrayList(eframes.length); - - for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); - exon_frames.add(Integer.decode(eframes[i])); - } - - feature.setExons(exons); - feature.setExon_frames(exon_frames); - return feature; - } - - @Override - public Object readHeader(LineReader reader) { - return null; - } - - @Override - public Class getFeatureType() { - return RefSeqCodec.class; - } -} +package org.broadinstitute.sting.gatk.refdata.features.refseq; + +import org.apache.commons.io.filefilter.FalseFileFilter; +import org.broad.tribble.Feature; +import org.broad.tribble.TribbleException; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.ArrayList; + +/** + * the ref seq codec + */ +public class RefSeqCodec implements ReferenceDependentFeatureCodec { + + /** + * The parser to use when resolving genome-wide locations. + */ + private GenomeLocParser genomeLocParser; + private boolean zero_coding_length_user_warned = false; + /** + * Set the parser to use when resolving genetic data. + * @param genomeLocParser The supplied parser. + */ + @Override + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + @Override + public Feature decodeLoc(String line) { + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); + String contig_name = fields[2]; + try { + return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + } catch ( UserException.MalformedGenomeLoc e ) { + Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); + return null; + } + } + + /** Fills this object from a text line in RefSeq (UCSC) text dump file */ + @Override + public RefSeqFeature decode(String line) { + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + + // we reference postion 15 in the split array below, make sure we have at least that many columns + if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); + String contig_name = fields[2]; + RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + + feature.setTranscript_id(fields[1]); + if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1); + else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); + else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); + + int coding_start = Integer.parseInt(fields[6])+1; + int coding_stop = Integer.parseInt(fields[7]); + + if ( coding_start > coding_stop ) { + if ( ! zero_coding_length_user_warned ) { + Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+ + "Such transcripts will be ignored (this warning is printed only once)"); + zero_coding_length_user_warned = true; + } + return null; + } + + feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop)); + feature.setGene_name(fields[12]); + String[] exon_starts = fields[9].split(","); + String[] exon_stops = fields[10].split(","); + String[] eframes = fields[15].split(","); + + if ( exon_starts.length != exon_stops.length ) + throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); + if ( exon_starts.length != eframes.length ) + throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); + + ArrayList exons = new ArrayList(exon_starts.length); + ArrayList exon_frames = new ArrayList(eframes.length); + + for ( int i = 0 ; i < exon_starts.length ; i++ ) { + exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); + exon_frames.add(Integer.decode(eframes[i])); + } + + feature.setExons(exons); + feature.setExon_frames(exon_frames); + return feature; + } + + @Override + public Object readHeader(LineReader reader) { + return null; + } + + @Override + public Class getFeatureType() { + return RefSeqFeature.class; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index 61d53679a..b0e25e55b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -160,8 +160,8 @@ public class ListFileUtils { rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures())); if ( ! rodBinding.getType().isAssignableFrom(descriptor.getFeatureClass()) ) throw new UserException.BadArgumentValue(rodBinding.getName(), - String.format("Field %s expected type %s, but the type of the input file provided on the command line was %s. Please make sure that you have provided the correct file type and/or that you are not binding your rod to a name matching one of the available types.", - rodBinding.getName(), rodBinding.getType(), descriptor.getName())); + String.format("Field %s expected type %s, but the type of the input file provided on the command line was %s producing %s. Please make sure that you have provided the correct file type and/or that you are not binding your rod to a name matching one of the available types.", + rodBinding.getName(), rodBinding.getType(), descriptor.getName(), descriptor.getFeatureClass())); rodBindings.add(triplet); From c193f52e5de69e8dd837f42aae299a31ed5b016e Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 17 Aug 2011 16:29:45 -0400 Subject: [PATCH 6/7] Fixed up examples: pasting from wiki still had old rod syntax --- .../gatk/walkers/beagle/BeagleOutputToVCFWalker.java | 10 +++++----- .../gatk/walkers/beagle/ProduceBeagleInputWalker.java | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index aca176bc2..51fe543df 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -64,11 +64,11 @@ import static java.lang.Math.log10; *
  *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
  *      -R reffile.fasta -T BeagleOutputToVCF \
- *      -B:variant,VCF input_vcf.vcf \
- *      -B:beagleR2,BEAGLE /myrun.beagle_output.r2 \
- *      -B:beaglePhased,BEAGLE /myrun.beagle_output.phased \
- *      -B:beagleProbs,BEAGLE /myrun.beagle_output.gprobs \
- *      --out output_vcf.vcf
+ *      -V input_vcf.vcf \
+ *      -beagleR2:BEAGLE /myrun.beagle_output.r2 \
+ *      -beaglePhased:BEAGLE /myrun.beagle_output.phased \
+ *      -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
+ *      -o output_vcf.vcf
  *      

Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before walker is run in order to decompress them

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 6ac817555..07793fd7b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -68,7 +68,7 @@ import java.util.*; *
  *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
  *      -R reffile.fasta -T ProduceBeagleInput \
- *      -B:variant,VCF path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
  * 
* */ From 81a792afeb6f6bcdf81437a58417217924aeb61b Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 17 Aug 2011 16:58:24 -0400 Subject: [PATCH 7/7] Reverting optimization disable in unstable. --- .../gatk/datasources/reads/LowMemoryIntervalSharder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index 198f7d7d3..ba6321121 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator { */ public FilePointer next() { FilePointer current = wrappedIterator.next(); - //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) - // current = current.combine(parser,wrappedIterator.next()); + while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); return current; }