changes in three files to make the HapMap RODs work:

- HapMapAlleleFrequenciesROD.java - the referenceOrderedDatum implementation
 - PrepareROD.java - has a static block that loads the known ROD classes, had to add the above
 - GenomeAnalysisTK.java - when supplied a hapmap argument... loads the ROD

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@265 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kcibul 2009-04-02 19:55:19 +00:00
parent b4cdd1d9a1
commit c192a95998
3 changed files with 128 additions and 21 deletions

View File

@ -10,6 +10,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
@ -26,7 +27,6 @@ import java.io.File;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
public class GenomeAnalysisTK extends CommandLineProgram {
@ -83,7 +83,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
* Used by the walker.
*/
public PrintStream err = System.err;
/**
* our log, which we want to capture anything from this class
*/
@ -103,7 +103,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalArg("genome_region", "L", "Genome region to operation on: from chr:start-end", "REGION_STR");
m_parser.addRequiredArg("analysis_type", "T", "Type of analysis to run", "Analysis_Name");
m_parser.addOptionalArg("DBSNP", "D", "DBSNP file", "DBSNP_FILE");
m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE");
m_parser.addOptionalArg("hapmap", "H", "Hapmap file", "HAPMAP_FILE");
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS");
@ -113,7 +113,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI");
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
m_parser.addOptionalArg("err", "e", "An error output file presented to the walker. Will overwrite contents if file exists.", "errFileName" );
m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName");
m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName");
}
/**
@ -142,7 +142,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
@Override
protected String getArgumentSourceName( Class argumentSource ) {
return WalkerManager.getWalkerName( (Class<Walker>)argumentSource );
return WalkerManager.getWalkerName( (Class<Walker>)argumentSource );
}
/**
@ -157,23 +157,27 @@ public class GenomeAnalysisTK extends CommandLineProgram {
final boolean TEST_ROD = false;
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
if (TEST_ROD) {
ReferenceOrderedData gff = new ReferenceOrderedData(new File("single.gff"), rodGFF.class);
if ( TEST_ROD ) {
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class );
gff.testMe();
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class);
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class );
//dbsnp.testMe();
rods.add(dbsnp); // { gff, dbsnp };
} else if (DBSNP_FILE != null) {
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class);
//dbsnp.testMe();
rods.add(dbsnp); // { gff, dbsnp };
}
if (HAPMAP_FILE != null) {
ReferenceOrderedData gff = new ReferenceOrderedData(new File(HAPMAP_FILE), rodGFF.class);
rods.add(gff);
} else {
if ( DBSNP_FILE != null ) {
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class );
//dbsnp.testMe();
rods.add(dbsnp); // { gff, dbsnp };
}
if ( HAPMAP_FILE != null ) {
ReferenceOrderedData hapmap = new ReferenceOrderedData(new File(HAPMAP_FILE), HapMapAlleleFrequenciesROD.class );
//dbsnp.testMe();
rods.add(hapmap); // { gff, dbsnp };
}
}
initializeOutputStreams();

View File

@ -0,0 +1,105 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import java.util.List;
import java.util.Arrays;
import edu.mit.broad.picard.util.SequenceUtil;
/**
* ReferenceOrderedDatum class to hold HapMap AlleleFrequency Data
*/
public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum {
public GenomeLoc loc; // genome location of SNP
// Reference sequence chromosome or scaffold
// Start and stop positions in chrom
public String rsNumber; // dbsnp rsNumber for this site
public String hgBuild;
public char Strand; // strand of the supplied alleles
public char refAllele;
public char varAllele;
public double refFreq;
public double varFreq;
public String strand; // maybe we don't need these?
public String alleles; // maybe we don't need these?
public Integer refCounts; // maybe we don't need these?
public Integer varCounts; // maybe we don't need these?
public Integer totalCounts; // maybe we don't need these?
public GenomeLoc getLocation() { return loc; }
public String toString() {
//rs11511647 HG18 chr10 62765 + T/C T C 21 97 0.178 0.822 118
return String.format(
"%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%1.3f\t%1.3f\t%d",
rsNumber, hgBuild, getContig(), getStart(), strand, alleles, refAllele, varAllele,
refCounts, varCounts, refFreq, varFreq, totalCounts);
}
public String toSimpleString() {
return String.format("%s:%s:%s:%1.3f", rsNumber, alleles, strand, varFreq);
}
public String repl() {
return toString();
}
public void parseLine(final String[] parts) {
try {
// rs11511647 <=> HG18 <=> chr10 <=> 62765 <=> + <=> T/C <=> T <=> C <=> 21 <=> 97 <=> 0.178 <=> 0.822 <=> 118
rsNumber = parts[0]; //rs#
hgBuild = parts[1]; // build
String contig = parts[2]; // chrom
long start = Long.parseLong(parts[3]); // The final is 1 based
long stop = start;
strand = parts[4]; // strand
alleles = parts[5]; //alleles
refAllele = parts[6].charAt(0); // ref_allele
varAllele = parts[7].charAt(0); // var_allele
refCounts = Integer.parseInt(parts[8]); // CEU_ref
varCounts = Integer.parseInt(parts[9]); // CEU_var
refFreq = Double.parseDouble(parts[10]); // CEU_ref_freq
varFreq = Double.parseDouble(parts[11]); // CEU_var_freq
totalCounts = Integer.parseInt(parts[12]); // CEU_var
loc = new GenomeLoc(contig, start, stop);
} catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts));
throw e;
}
}
public double getVarAlleleFreq() { return this.varFreq; }
public List<String> getAllelesFWD() {
List<String> alleleList;
if ( onFwdStrand() )
alleleList = Arrays.asList(alleles.split("/"));
else
alleleList = Arrays.asList(SequenceUtil.reverseComplement(alleles).split("/"));
return alleleList;
}
public boolean onFwdStrand() {
return strand.equals("+");
}
}

View File

@ -8,10 +8,7 @@ import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.refdata.*;
import java.io.*;
import java.util.HashMap;
@ -36,6 +33,7 @@ public class PrepareROD extends CommandLineProgram {
static {
addModule("GFF", rodGFF.class);
addModule("dbSNP", rodDbSNP.class);
addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
}
/** Required main method implementation. */