Removal of most of the old GATK ROD system; also a fix for -Dsingle so we can again run just a single unit or integration test (single tests in tribble can now be run with the -DsingleTest option). More to come.
*** Three integration tests had to change: ***
RecalibarationWalkersIntegrationTest: one of the tests was using the interval track as the SNP track and wasn't supplying a dbSNP track (for CountCovariates).
SequenomValidationConverterIntegrationTest: relies on the Plink ROD, which we've removed.
PileupWalkerIntegrationTest: we no longer have implicit interval tracks, so there is no ROD name over the specified region. Otherwise the same result.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4292 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c604ed9440
commit
782e0018e4
@@ -30,9 +30,11 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;

import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.gatk.datasources.sample.SampleFileParser;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -50,7 +52,6 @@ import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.*;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
@@ -381,12 +382,8 @@ public class GenomeAnalysisEngine {
        referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile);

        if (argCollection.DBSNPFile != null) bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile);
        // TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system.
        if (argCollection.intervals != null && argCollection.intervals.size() == 1) {
            bindConvenienceRods("interval", "Intervals", argCollection.intervals.get(0).replaceAll(",", ""));
        }

        RMDTrackManager manager = new RMDTrackManager();
        RMDTrackBuilder manager = new RMDTrackBuilder();
        List<RMDTrack> tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings);
        validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks);
@@ -3,9 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import org.broad.tribble.FeatureSource;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.FlashBackIterator;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
@@ -51,7 +50,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
    public ReferenceOrderedDataSource( Walker walker, RMDTrack rod) {
        this.rod = rod;
        if (rod.supportsQuery())
            iteratorPool = new ReferenceOrderedQueryDataPool(new TribbleRMDTrackBuilder(), (TribbleTrack)rod);
            iteratorPool = new ReferenceOrderedQueryDataPool(new RMDTrackBuilder(),rod);
        else
            iteratorPool = new ReferenceOrderedDataPool( walker, rod );
    }
@@ -112,7 +111,7 @@ class ReferenceOrderedDataPool extends ResourcePool<LocationAwareSeekableRODIter
    private final RMDTrack rod;
    boolean flashbackData = false;
    public ReferenceOrderedDataPool( Walker walker, RMDTrack rod ) {
        if (walker instanceof ReadWalker) flashbackData = true; // && (rod.getType() != IntervalRod.class)
        if (walker instanceof ReadWalker) flashbackData = true;
        this.rod = rod;
    }
@@ -184,9 +183,9 @@ class ReferenceOrderedQueryDataPool extends ResourcePool<FeatureSource, Location
    private final RMDTrack rod;

    // our tribble track builder
    private final TribbleRMDTrackBuilder builder;
    private final RMDTrackBuilder builder;

    public ReferenceOrderedQueryDataPool( TribbleRMDTrackBuilder builder, TribbleTrack rod ) {
    public ReferenceOrderedQueryDataPool( RMDTrackBuilder builder, RMDTrack rod ) {
        this.rod = rod;
        this.builder = builder;
        // a little bit of a hack, but it saves us from re-reading the index from the file
@ -1,46 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 27, 2009
|
||||
* Time: 10:49:47 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
|
||||
protected String name;
|
||||
|
||||
public BasicReferenceOrderedDatum(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getName() { return this.name; }
|
||||
|
||||
public abstract boolean parseLine(final Object header, final String[] parts) throws IOException;
|
||||
|
||||
public abstract String toString();
|
||||
public String toSimpleString() { return toString(); }
|
||||
public String repl() { return this.toString(); }
|
||||
|
||||
public String delimiterRegex() {
|
||||
return "\t";
|
||||
}
|
||||
|
||||
public abstract GenomeLoc getLocation();
|
||||
|
||||
public int compareTo( ReferenceOrderedDatum that ) {
|
||||
//System.out.printf("Comparing %s to %s => %d%n", getLocation(), that.getLocation(), getLocation().compareTo(that.getLocation()));
|
||||
return getLocation().compareTo(that.getLocation());
|
||||
}
|
||||
|
||||
public Object initialize(final File source) throws FileNotFoundException {
|
||||
//System.out.printf("Initialize called with %s%n", source);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
public class IntervalRod extends BasicReferenceOrderedDatum {
|
||||
private GenomeLoc loc = null;
|
||||
|
||||
public IntervalRod(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
public IntervalRod(String name, GenomeLoc loc) {
|
||||
super(name);
|
||||
this.loc = loc;
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() { return loc; }
|
||||
|
||||
/** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false),
|
||||
* since this rod provides its own iterator for reading underlying data files.
|
||||
*/
|
||||
public boolean parseLine(Object header, String[] parts) {
|
||||
return false; // this rod has its own iterator
|
||||
}
|
||||
|
||||
public String repl() {
|
||||
throw new ReviewedStingException("repl() is not implemented yet");
|
||||
}
|
||||
|
||||
public String toSimpleString() { return loc.toString(); }
|
||||
public String toString() { return toSimpleString(); }
|
||||
|
||||
public static IntervalRodIterator createIterator(String trackName, File f) throws IOException {
|
||||
return IntervalRodIterator.IntervalRodIteratorFromLocsFile(trackName, f);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
||||
public class IntervalRodIterator implements Iterator<IntervalRod> {
|
||||
//private List<GenomeLoc> locations = null;
|
||||
private Iterator<GenomeLoc> iter;
|
||||
private String trackName = null;
|
||||
|
||||
//public static RODIterator<IntervalRod> IntervalRodIteratorFromLocs(List<GenomeLoc> locs) {
|
||||
// IntervalRodIterator it = new IntervalRodIterator("interval", locs);
|
||||
// return new RODIterator<IntervalRod>(it);
|
||||
//}
|
||||
|
||||
public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
|
||||
//System.out.printf("Parsing %s for intervals %s%n", file, trackName);
|
||||
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(IntervalUtils.parseIntervalArguments(Collections.singletonList(file.getPath())),
|
||||
GenomeAnalysisEngine.instance.getArguments().intervalMerging);
|
||||
//System.out.printf(" => got %d entries %n", locs.size());
|
||||
return new IntervalRodIterator(trackName, locs);
|
||||
}
|
||||
|
||||
public IntervalRodIterator(String trackName, GenomeLocSortedSet locs) {
|
||||
this.trackName = trackName;
|
||||
//locations = locs;
|
||||
iter = locs.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return iter.hasNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the next element in the iteration.
|
||||
* @throws NoSuchElementException - iterator has no more elements.
|
||||
*/
|
||||
@Override
|
||||
public IntervalRod next() {
|
||||
if (!this.hasNext()) throw new NoSuchElementException("IntervalRodIterator next called on iterator with no more elements");
|
||||
IntervalRod r = new IntervalRod(trackName, iter.next());
|
||||
//System.out.printf("IntervalRod next is %s%n", r);
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,579 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl, ebanks
|
||||
*
|
||||
*/
|
||||
public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<PlinkRod> {
|
||||
|
||||
public static final String SEQUENOM_NO_CALL = "0";
|
||||
public static final String SEQUENOM_NO_BASE = "-";
|
||||
|
||||
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
|
||||
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE","#Individual ID"));
|
||||
private final byte SNP_MAJOR_MODE = 1;
|
||||
|
||||
private PlinkVariantInfo currentVariant;
|
||||
private ListIterator<PlinkVariantInfo> variantIterator;
|
||||
|
||||
private PlinkFileType plinkFileType;
|
||||
|
||||
private List<String> sampleNames;
|
||||
|
||||
private String[] fileHeader;
|
||||
|
||||
public enum PlinkFileType {
|
||||
STANDARD_PED, RAW_PED, BINARY_PED
|
||||
}
|
||||
|
||||
public PlinkRod(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
public PlinkRod(String name, PlinkVariantInfo record, ListIterator<PlinkVariantInfo> iter, List<String> sampleNames) {
|
||||
super(name);
|
||||
currentVariant = record;
|
||||
variantIterator = iter;
|
||||
this.sampleNames = sampleNames;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object initialize(final File plinkFile) throws FileNotFoundException {
|
||||
if ( ! plinkFile.exists() ) {
|
||||
throw new FileNotFoundException("File "+plinkFile.getAbsolutePath()+" does not exist.");
|
||||
}
|
||||
|
||||
ArrayList<PlinkVariantInfo> variants = parsePlinkFile(plinkFile);
|
||||
if ( variants == null )
|
||||
throw new IllegalStateException("Error parsing Plink file");
|
||||
|
||||
variantIterator = variants.listIterator();
|
||||
return null;
|
||||
}
|
||||
|
||||
public static PlinkRod createIterator(String name, File file) {
|
||||
PlinkRod plink = new PlinkRod(name);
|
||||
try {
|
||||
plink.initialize(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new ReviewedStingException("Unable to find file " + file);
|
||||
}
|
||||
return plink;
|
||||
}
|
||||
|
||||
private void assertNotNull() {
|
||||
if ( currentVariant == null ) {
|
||||
throw new IllegalStateException("Current variant information is null");
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return variantIterator.hasNext();
|
||||
}
|
||||
|
||||
public PlinkRod next() {
|
||||
if ( !this.hasNext() )
|
||||
throw new NoSuchElementException("PlinkRod next called on iterator with no more elements");
|
||||
|
||||
// get the next record
|
||||
currentVariant = variantIterator.next();
|
||||
return new PlinkRod(name, currentVariant, variantIterator, sampleNames);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean parseLine(Object obj, String[] args) {
|
||||
throw new UnsupportedOperationException("PlinkRod does not support the parseLine method");
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("The remove operation is not supported for a PlinkRod");
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc getLocation() {
|
||||
assertNotNull();
|
||||
return currentVariant.getLocation();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
assertNotNull();
|
||||
return currentVariant.toString();
|
||||
}
|
||||
|
||||
public String getVariantName() {
|
||||
assertNotNull();
|
||||
return currentVariant.getName();
|
||||
|
||||
}
|
||||
|
||||
/* Get the mapping from sample name to genotypes (array of Alleles).
|
||||
* Important note: none of the Alleles returned here are annotated as reference
|
||||
* (since the rod doesn't know offhand what the reference allele is).
|
||||
*
|
||||
* @return mapping from sample name to genotype
|
||||
*/
|
||||
public Map<String, Allele[]> getGenotypes() {
|
||||
return currentVariant.getGenotypes();
|
||||
}
|
||||
|
||||
public List<String> getSampleNames() {
|
||||
return sampleNames;
|
||||
}
|
||||
|
||||
public boolean isIndel() {
|
||||
assertNotNull();
|
||||
return currentVariant.isIndel();
|
||||
}
|
||||
|
||||
public boolean isInsertion() {
|
||||
assertNotNull();
|
||||
return currentVariant.isInsertion();
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
assertNotNull();
|
||||
return currentVariant.getLength();
|
||||
}
|
||||
|
||||
|
||||
// AM I PARSING A TEXT OR A BINARY FILE ??
|
||||
|
||||
private ArrayList<PlinkVariantInfo> parsePlinkFile(File file) {
|
||||
String[] splitFileName = file.getName().split("\\.");
|
||||
String extension = splitFileName[splitFileName.length-1];
|
||||
if ( extension.equals("ped") || extension.equals("raw") ) {
|
||||
return parseTextFormattedPlinkFile(file);
|
||||
} else if ( extension.equals("bed") || extension.equals("bim") || extension.equals("fam") ) {
|
||||
plinkFileType = PlinkFileType.BINARY_PED;
|
||||
return parseBinaryFormattedPlinkFile(file);
|
||||
} else {
|
||||
System.out.println("Warning -- Plink file does not have a standard extension (ped/raw for text, bed/bim/fam for binary) -- assuming ped format");
|
||||
return parseTextFormattedPlinkFile(file);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***
|
||||
* * PARSING STANDARD TEXT PED FILES * **
|
||||
* *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***/
|
||||
|
||||
private ArrayList<PlinkVariantInfo> parseTextFormattedPlinkFile( File file ) {
|
||||
try {
|
||||
BufferedReader reader = new BufferedReader( new FileReader ( file ) );
|
||||
ArrayList<PlinkVariantInfo> seqVars = new ArrayList<PlinkVariantInfo>();
|
||||
int headerFieldCount = instantiateVariantListFromHeader(seqVars, reader.readLine());
|
||||
|
||||
sampleNames = new ArrayList<String>();
|
||||
|
||||
String line;
|
||||
do {
|
||||
line = reader.readLine();
|
||||
incorporateInfo(seqVars, line, headerFieldCount);
|
||||
} while ( line != null );
|
||||
|
||||
|
||||
java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these
|
||||
// are sorted in standard reference order
|
||||
|
||||
return seqVars;
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new ReviewedStingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e);
|
||||
} catch ( IOException e ) {
|
||||
throw new ReviewedStingException("Error reading file "+file.getAbsolutePath()+".",e);
|
||||
}
|
||||
}
|
||||
|
||||
private void incorporateInfo(List<PlinkVariantInfo> vars, String plinkLine, int headerFieldCount) {
|
||||
if ( plinkLine == null ) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ( plinkFileType != PlinkFileType.STANDARD_PED )
|
||||
throw new ReviewedStingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
|
||||
|
||||
StringTokenizer st = new StringTokenizer(plinkLine, "\t");
|
||||
int offset = 0;
|
||||
String sample = st.nextToken();
|
||||
while ( ! fileHeader[offset].equals("Individual ID") && ! fileHeader[offset].equals("#Individual ID") ) {
|
||||
sample = st.nextToken(); // kill nonstandard tokens
|
||||
offset ++;
|
||||
}
|
||||
|
||||
sampleNames.add(sample);
|
||||
for (int i = offset+1; i < headerFieldCount ; i++)
|
||||
st.nextToken();
|
||||
|
||||
int snpNumber = 0;
|
||||
while ( snpNumber < vars.size() ) {
|
||||
vars.get(snpNumber++).addGenotypeEntry(st.nextToken().split("\\s+"));
|
||||
}
|
||||
}
|
||||
|
||||
private int instantiateVariantListFromHeader(ArrayList<PlinkVariantInfo> seqVars, String header) {
|
||||
// if the first line is not a comment (what we're used to seeing),
|
||||
// then it's the raw header (comes from de-binary-ing a .bed file)
|
||||
if ( !header.startsWith("#") )
|
||||
throw new ReviewedStingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
|
||||
|
||||
plinkFileType = PlinkFileType.STANDARD_PED;
|
||||
|
||||
String[] headerFields = header.split("\t");
|
||||
fileHeader = headerFields;
|
||||
int skippedFields = 0;
|
||||
for ( String field : headerFields ) {
|
||||
if ( headerEntries.contains(field) )
|
||||
skippedFields++;
|
||||
else
|
||||
// not a standard header, so a variant
|
||||
seqVars.add(new PlinkVariantInfo(field));
|
||||
}
|
||||
|
||||
return skippedFields;
|
||||
}
|
||||
|
||||
/* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
|
||||
* * PARSING BINARY PLINK BED/BIM/FAM FILES * *
|
||||
* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***/
|
||||
|
||||
private ArrayList<PlinkVariantInfo> parseBinaryFormattedPlinkFile(File file) {
|
||||
PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file);
|
||||
ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile);
|
||||
sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
|
||||
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants, binaryFiles.bedFile);
|
||||
|
||||
java.util.Collections.sort(updatedVariants);
|
||||
|
||||
return updatedVariants;
|
||||
}
|
||||
|
||||
private PlinkBinaryTrifecta getPlinkBinaryTriplet(File file) {
|
||||
// just gonna parse the name
|
||||
PlinkBinaryTrifecta trifecta = new PlinkBinaryTrifecta();
|
||||
String absolute_path = file.getAbsolutePath();
|
||||
String[] directory_tree = absolute_path.split("/");
|
||||
String file_name = directory_tree[directory_tree.length-1].split("\\.")[0];
|
||||
StringBuilder pathBuilder = new StringBuilder();
|
||||
for ( int i = 0; i < directory_tree.length - 1; i ++ ) {
|
||||
pathBuilder.append(String.format("%s/",directory_tree[i]));
|
||||
}
|
||||
String path = pathBuilder.toString();
|
||||
trifecta.bedFile = new File(path+file_name+".bed");
|
||||
trifecta.bimFile = new File(path+file_name+".bim");
|
||||
trifecta.famFile = new File(path+file_name+".fam");
|
||||
|
||||
return trifecta;
|
||||
|
||||
}
|
||||
|
||||
private ArrayList<PlinkVariantInfo> instantiateVariantsFromBimFile(File bimFile) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader( new FileReader( bimFile ));
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new ReviewedStingException("The SNP information file accompanying the binary ped file was not found (the .bim file). "+
|
||||
"Please ensure that it is in the same directory as the .bed and .fam files. The file we "+
|
||||
"Were looking for was "+bimFile.getAbsolutePath(),e);
|
||||
}
|
||||
|
||||
ArrayList<PlinkVariantInfo> variants = new ArrayList<PlinkVariantInfo>();
|
||||
|
||||
try {
|
||||
String line = reader.readLine();
|
||||
while ( line != null ) {
|
||||
String[] snpInfo = line.split("\\s+");
|
||||
PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1]);
|
||||
variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[3]), Long.valueOf(snpInfo[3])));
|
||||
variant.setAlleles(snpInfo[4],snpInfo[5]);
|
||||
variants.add(variant);
|
||||
|
||||
line = reader.readLine();
|
||||
}
|
||||
} catch ( IOException e ) {
|
||||
throw new ReviewedStingException("There was an error reading the .bim file "+bimFile.getAbsolutePath(), e);
|
||||
}
|
||||
|
||||
return variants;
|
||||
}
|
||||
|
||||
private static ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader( new FileReader( famFile ));
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new ReviewedStingException("The Family information file accompanying the binary ped file was not found (the .fam file). "+
|
||||
"Please ensure that it is in the same directory as the .bed and .bim files. The file we "+
|
||||
"Were looking for was "+famFile.getAbsolutePath(),e);
|
||||
}
|
||||
|
||||
ArrayList<String> sampleNames = new ArrayList<String>();
|
||||
|
||||
try {
|
||||
String line;
|
||||
do {
|
||||
line = reader.readLine();
|
||||
if ( line != null ) {
|
||||
sampleNames.add(line.split("\\s+")[1]);
|
||||
}
|
||||
} while ( line != null );
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("There was an error reading the .fam file "+famFile.getAbsolutePath(),e);
|
||||
}
|
||||
|
||||
return sampleNames;
|
||||
}
|
||||
|
||||
private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, File bedFile) {
|
||||
FileInputStream inStream;
|
||||
try {
|
||||
inStream = new FileInputStream(bedFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new ReviewedStingException("The Binary pedigree file file accompanying the family file was not found (the .bed file). "+
|
||||
"Please ensure that it is in the same directory as the .bim and .fam files. The file we "+
|
||||
"Were looking for was "+bedFile.getAbsolutePath(),e);
|
||||
}
|
||||
|
||||
try {
|
||||
byte genotype;
|
||||
long bytesRead = 0;
|
||||
int snpOffset = 0;
|
||||
int sampleOffset = 0;
|
||||
boolean snpMajorMode = true;
|
||||
do {
|
||||
genotype = (byte) inStream.read();
|
||||
bytesRead++;
|
||||
if ( genotype != -1 ) {
|
||||
if ( bytesRead > 3 ) {
|
||||
addGenotypeByte(genotype,variants,snpOffset,sampleOffset, snpMajorMode);
|
||||
|
||||
if ( snpMajorMode ) {
|
||||
sampleOffset = sampleOffset + 4;
|
||||
if ( sampleOffset > sampleNames.size() -1 ) {
|
||||
snpOffset ++;
|
||||
sampleOffset = 0;
|
||||
}
|
||||
} else {
|
||||
snpOffset = snpOffset + 4;
|
||||
if ( snpOffset > variants.size() -1 ) {
|
||||
sampleOffset ++;
|
||||
snpOffset = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
if ( bytesRead == 3) {
|
||||
snpMajorMode = genotype == SNP_MAJOR_MODE;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while ( genotype != -1 );
|
||||
} catch ( IOException e) {
|
||||
throw new ReviewedStingException("Error reading binary ped file "+bedFile.getAbsolutePath(), e);
|
||||
}
|
||||
|
||||
return variants;
|
||||
}
|
||||
|
||||
private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, int snpOffset, int sampleOffset, boolean major) {
|
||||
// four genotypes encoded in this byte
|
||||
int[] genotypes = parseGenotypes(genotype);
|
||||
for ( int g : genotypes ) {
|
||||
variants.get(snpOffset).addBinaryGenotypeEntry(g);
|
||||
|
||||
if ( major ) {
|
||||
sampleOffset++;
|
||||
if ( sampleOffset > sampleNames.size()-1 ) {// using offsets for comparison; size 5 == offset 4
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
snpOffset++;
|
||||
if( snpOffset > variants.size()-1 ) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int[] parseGenotypes(byte genotype) {
|
||||
int[] genotypes = new int[4];
|
||||
genotypes[0] = ( genotype & 3 );
|
||||
genotypes[1] = ( ( genotype & 12 ) >>> 2 );
|
||||
genotypes[2] = ( ( genotype & 48 ) >>> 4 );
|
||||
genotypes[3] = ( ( genotype & 192 ) >>> 6 );
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
class PlinkVariantInfo implements Comparable {
|
||||
|
||||
private String variantName;
|
||||
private GenomeLoc loc;
|
||||
|
||||
// the list of genotypes in the same order as in sampleNames (using a map here is inefficient)
|
||||
private List<Allele[]> genotypes = new ArrayList<Allele[]>();
|
||||
|
||||
// map of Alleles seen (so that we can share Allele objects among samples)
|
||||
HashMap<String, Allele> alleles = new HashMap<String, Allele>(4);
|
||||
|
||||
// for indels
|
||||
private boolean isIndel = false;
|
||||
private boolean isInsertion = false;
|
||||
private int length = 1;
|
||||
|
||||
// for binary parsing
|
||||
private String locAllele1;
|
||||
private String locAllele2;
|
||||
|
||||
|
||||
public PlinkVariantInfo(String variantName) {
|
||||
this.variantName = variantName;
|
||||
parseName();
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return variantName;
|
||||
}
|
||||
|
||||
public Map<String, Allele[]> getGenotypes() {
|
||||
Map<String, Allele[]> genotypeMap = new HashMap<String, Allele[]>();
|
||||
int index = 0;
|
||||
for ( Allele[] myAlleles : genotypes )
|
||||
genotypeMap.put(sampleNames.get(index++), myAlleles);
|
||||
return genotypeMap;
|
||||
}
|
||||
|
||||
public boolean isIndel() {
|
||||
return isIndel;
|
||||
}
|
||||
|
||||
public boolean isInsertion() {
|
||||
return isInsertion;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
public void setGenomeLoc(GenomeLoc loc) {
|
||||
this.loc = loc;
|
||||
}
|
||||
|
||||
private void parseName() {
|
||||
int chromIdx = variantName.indexOf("|c");
|
||||
if ( chromIdx == -1 )
|
||||
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c...)");
|
||||
String[] pieces = variantName.substring(chromIdx+2).split("_");
|
||||
if ( pieces.length < 2 )
|
||||
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
|
||||
|
||||
String chrom = pieces[0].trim();
|
||||
if ( pieces[1].charAt(0) != 'p' )
|
||||
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
|
||||
|
||||
String pos = pieces[1].substring(1).trim();
|
||||
loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
|
||||
|
||||
if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) {
|
||||
// it's an indel
|
||||
isIndel = true;
|
||||
isInsertion = pieces[2].startsWith("gI");
|
||||
try {
|
||||
// length of insertion on reference is still 1
|
||||
if ( !isInsertion )
|
||||
length = Integer.parseInt(pieces[2].substring(2));
|
||||
} catch (NumberFormatException e) {
|
||||
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void setAlleles(String al1, String al2) {
|
||||
if ( al1.equals(PlinkRod.SEQUENOM_NO_CALL) ) {
|
||||
// encoding for a site at which no variants were detected
|
||||
locAllele1 = al2;
|
||||
} else {
|
||||
locAllele1 = al1;
|
||||
}
|
||||
locAllele2 = al2;
|
||||
}
|
||||
|
||||
public void addGenotypeEntry(String[] alleleStrings) {
|
||||
|
||||
Allele[] myAlleles = new Allele[2];
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if ( alleleStrings.length <= i ) {
|
||||
myAlleles[i] = null;
|
||||
continue;
|
||||
}
|
||||
|
||||
String alleleString = alleleStrings[i];
|
||||
|
||||
Allele allele;
|
||||
if ( alleles.containsKey(alleleString) ) {
|
||||
allele = alleles.get(alleleString);
|
||||
} else {
|
||||
if ( PlinkRod.SEQUENOM_NO_CALL.equals(alleleString) )
|
||||
allele = Allele.NO_CALL;
|
||||
else
|
||||
allele = Allele.create(alleleString);
|
||||
alleles.put(alleleString, allele);
|
||||
}
|
||||
|
||||
myAlleles[i] = allele;
|
||||
}
|
||||
|
||||
genotypes.add(myAlleles);
|
||||
}
|
||||
|
||||
public void addBinaryGenotypeEntry(int genoTYPE) {
|
||||
String[] alleleStr = new String[2];
|
||||
if ( genoTYPE == 0 ) {
|
||||
alleleStr[0] = locAllele1;
|
||||
alleleStr[1] = locAllele1;
|
||||
} else if (genoTYPE == 2) {
|
||||
alleleStr[0] = locAllele1;
|
||||
alleleStr[1] = locAllele2;
|
||||
} else if (genoTYPE == 3 ) {
|
||||
alleleStr[0] = locAllele2;
|
||||
alleleStr[1] = locAllele2;
|
||||
} else {
|
||||
alleleStr[0] = "0";
|
||||
alleleStr[1] = "0";
|
||||
}
|
||||
|
||||
addGenotypeEntry(alleleStr);
|
||||
}
|
||||
|
||||
public int compareTo(Object obj) {
|
||||
if ( ! ( obj instanceof PlinkVariantInfo) ) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return loc.compareTo(((PlinkVariantInfo) obj).getLocation());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class PlinkBinaryTrifecta {
|
||||
|
||||
public PlinkBinaryTrifecta() {}
|
||||
|
||||
public File bedFile;
|
||||
public File bimFile;
|
||||
public File famFile;
|
||||
|
||||
}
|
||||
|
|
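For readers scanning the removed PlinkRod above: its binary .bed parsing relies on PLINK's packing of four two-bit genotype codes into each byte (0 = homozygous for allele 1, 2 = heterozygous, 3 = homozygous for allele 2, 1 = missing). The following standalone sketch mirrors the parseGenotypes()/addBinaryGenotypeEntry() logic shown above; it is plain illustrative Java, not GATK code, and the class name and sample alleles are invented for the example.

public class PlinkBedByteDemo {
    /** Unpack one .bed byte into its four 2-bit genotype codes, lowest-order bits first. */
    static int[] parseGenotypes(byte b) {
        return new int[] {
                b & 3,             // genotype for sample i
                (b & 12) >>> 2,    // sample i+1
                (b & 48) >>> 4,    // sample i+2
                (b & 192) >>> 6    // sample i+3
        };
    }

    /** Map a 2-bit code to an allele pair, following the same cases as addBinaryGenotypeEntry(). */
    static String toGenotype(int code, String allele1, String allele2) {
        switch (code) {
            case 0:  return allele1 + "/" + allele1; // homozygous allele 1
            case 2:  return allele1 + "/" + allele2; // heterozygous
            case 3:  return allele2 + "/" + allele2; // homozygous allele 2
            default: return "./.";                   // code 1: no call
        }
    }

    public static void main(String[] args) {
        byte packed = (byte) 0xE4; // binary 11 10 01 00 -> codes 0, 1, 2, 3
        for (int code : parseGenotypes(packed))
            System.out.println(toGenotype(code, "A", "G"));
    }
}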
@ -1,358 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
/**
|
||||
* Class for representing arbitrary reference ordered data sets
|
||||
*
|
||||
* User: mdepristo
|
||||
* Date: Feb 27, 2009
|
||||
* Time: 10:47:14 AM
|
||||
*
|
||||
* System for interacting with tabular formatted data of the following format:
|
||||
*
|
||||
* # comment line
|
||||
* # must include HEADER KEYWORD
|
||||
* HEADER COL1 ... COLN
|
||||
* chr:pos data1 ... dataN
|
||||
*
|
||||
* The system supports the rod interface. You can just access tabularRODs through the normal ROD system.
|
||||
*
|
||||
* You can also write your own files, as such:
|
||||
*
|
||||
* ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
* assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
||||
* String rowData = String.format("%d %d %d", 1, 2, 3);
|
||||
* TabularROD row = new TabularROD("myName", header, GenomeLoc.parseGenomeLoc("chrM", 1), rowData.split(" "));
|
||||
* assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
|
||||
*/
|
||||
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {
|
||||
private static Logger logger = Logger.getLogger(TabularROD.class);
|
||||
|
||||
protected GenomeLoc loc;
|
||||
private HashMap<String, String> attributes;
|
||||
private ArrayList<String> header;
|
||||
|
||||
public static String DEFAULT_DELIMITER = "\t";
|
||||
public static String DEFAULT_DELIMITER_REGEX = "\\s+";
|
||||
|
||||
public static String DELIMITER = DEFAULT_DELIMITER;
|
||||
public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX;
|
||||
|
||||
private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
|
||||
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*((HEADER)|(loc)|(rs#)).*");
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
|
||||
private static int parsedRecords = 0;
|
||||
final static boolean printRecordsParsed = false;
|
||||
|
||||
/**
|
||||
* Set the global tabular ROD delimiter and the regex to split the delimiter.
|
||||
*
|
||||
* The delimiter to put between fields, while the regex is used to split lines
|
||||
*
|
||||
* @param delimiter
|
||||
* @param delimeterRegex
|
||||
*/
|
||||
public static void setDelimiter(final String delimiter, final String delimeterRegex) {
|
||||
DELIMITER = delimiter;
|
||||
DELIMITER_REGEX = delimeterRegex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a parsable string representation for the
|
||||
* @param header
|
||||
*/
|
||||
public static String headerString(final ArrayList<String> header) {
|
||||
requireGoodHeader(header);
|
||||
return Utils.join(DELIMITER, header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a comment line containing the *single line* string msg
|
||||
*
|
||||
* @param msg
|
||||
* @return
|
||||
*/
|
||||
public static String commentString(final String msg) {
|
||||
return "# " + msg;
|
||||
}
|
||||
|
||||
private static boolean headerIsGood(final ArrayList<String> header) {
|
||||
if ( header.size() == 0 ) return false;
|
||||
//if ( ! header.get(0).equals("HEADER") ) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void requireGoodHeader(final ArrayList<String> header) {
|
||||
if ( ! headerIsGood(header) )
|
||||
throw new RuntimeException("Header must begin with HEADER keyword");
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public TabularROD(final String name) {
|
||||
super(name);
|
||||
attributes = new HashMap<String, String>();
|
||||
}
|
||||
|
||||
public TabularROD(final String name, ArrayList<String> header) {
|
||||
this(name);
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data
|
||||
* must be bound to each corresponding header[i] field before the object is really usable.
|
||||
*
|
||||
* @param name
|
||||
* @param header
|
||||
* @param loc
|
||||
*/
|
||||
public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc) {
|
||||
this(name);
|
||||
this.header = header;
|
||||
this.loc = loc;
|
||||
requireGoodHeader(this.header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a new TabularROD with name, using header columns header, at loc, with data associated with the
|
||||
* header columns. data and header are assumed to be in the same order, and bindings will be established
|
||||
* from header[i] = data[i]. The TabularROD at this stage can be printed, manipulated, it is considered
|
||||
* a full fledged, initialized object.
|
||||
*
|
||||
* @param name
|
||||
* @param header
|
||||
* @param loc
|
||||
* @param data
|
||||
*/
|
||||
public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc, String[] data) {
|
||||
this(name, header, loc);
|
||||
|
||||
if ( header.size() != data.length + 1 )
|
||||
throw new RuntimeException(String.format("Incorrect tabular data format: header has %d columns but %d data elements were provided: %s",
|
||||
header.size(), data.length, Utils.join("\t", data)));
|
||||
for ( int i = 0; i < data.length; i++ ) {
|
||||
put(header.get(i+1), data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Walks through the source files looking for the header line, which it returns as a
|
||||
* list of strings.
|
||||
*
|
||||
* @param source
|
||||
* @return
|
||||
*/
|
||||
public Object initialize(final File source) throws FileNotFoundException {
|
||||
return readHeader(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* By default, all records (i.e. RODs) must contain the same number of fields as specified
|
||||
* by the header. Subclasses should override this method to disable this requirement.
|
||||
*
|
||||
* @return true if incomplete records are allowed; false otherwise
|
||||
*/
|
||||
public boolean allowIncompleteRecords() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public static ArrayList<String> readHeader(final File source) throws FileNotFoundException {
|
||||
ArrayList<String> header = null;
|
||||
int linesLookedAt = 0;
|
||||
XReadLines reader = new XReadLines(source);
|
||||
|
||||
for ( String line : reader ) {
|
||||
Matcher m = HEADER_PATTERN.matcher(line);
|
||||
if ( m.matches() ) {
|
||||
//System.out.printf("Found a header line: %s%n", line);
|
||||
header = new ArrayList<String>(Arrays.asList(line.split(DELIMITER_REGEX)));
|
||||
//System.out.printf("HEADER IS %s%n", Utils.join(":", header));
|
||||
}
|
||||
|
||||
if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER )
|
||||
break;
|
||||
}
|
||||
|
||||
// check that we found the header
|
||||
if ( header != null ) {
|
||||
logger.debug(String.format("HEADER IS %s%n", Utils.join(":", header)));
|
||||
} else {
|
||||
// use the indexes as the header fields
|
||||
logger.debug("USING INDEXES FOR ROD HEADER");
|
||||
// reset if necessary
|
||||
if ( !reader.hasNext() )
|
||||
reader = new XReadLines(source);
|
||||
header = new ArrayList<String>();
|
||||
int tokens = reader.next().split(DELIMITER_REGEX).length;
|
||||
for ( int i = 0; i < tokens; i++)
|
||||
header.add(Integer.toString(i));
|
||||
}
|
||||
|
||||
try {
|
||||
reader.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// ROD accessors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public GenomeLoc getLocation() {
|
||||
if ( loc != null )
|
||||
return loc;
|
||||
String s = get(header.get(0));
|
||||
if ( s == null )
|
||||
return null;
|
||||
return GenomeLocParser.parseGenomeLoc(s);
|
||||
}
|
||||
|
||||
public ArrayList<String> getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
public String get(final Object key) {
|
||||
return attributes.get(key);
|
||||
}
|
||||
|
||||
public String put(final String key, final String object) {
|
||||
return attributes.put(key, object);
|
||||
}
|
||||
|
||||
public boolean containsKey(final Object key) {
|
||||
return attributes.containsKey(key);
|
||||
}
|
||||
|
||||
public HashMap<String,String> getAttributes() {
|
||||
return attributes;
|
||||
}
|
||||
|
||||
public String getAttributeString() {
|
||||
List<String> strings = new ArrayList<String>(attributes.size());
|
||||
for ( String key : header ) {
|
||||
if ( containsKey(key) ) { // avoid the header
|
||||
strings.add(this.get(key));
|
||||
//System.out.printf("Adding %s%n", this.get(key));
|
||||
}
|
||||
}
|
||||
return Utils.join("\t", strings);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// map functions
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public int size() { return attributes.size(); }
|
||||
public boolean isEmpty() { return attributes.isEmpty(); }
|
||||
public boolean containsValue(Object o) { return attributes.containsValue(o); }
|
||||
public String remove(Object o) { return attributes.remove(o); }
|
||||
public void clear() { attributes.clear(); }
|
||||
public java.util.Set<String> keySet() { return attributes.keySet(); }
|
||||
public java.util.Collection<String> values() { return attributes.values(); }
|
||||
|
||||
public void putAll(java.util.Map<? extends String, ? extends String> map) {
|
||||
attributes.putAll(map);
|
||||
}
|
||||
|
||||
public java.util.Set<java.util.Map.Entry<String,String>> entrySet() {
|
||||
return attributes.entrySet();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// formatting
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
if ( loc != null )
|
||||
return String.format("%s\t%s", loc, getAttributeString());
|
||||
return String.format("%s", getAttributeString());
|
||||
}
|
||||
|
||||
/**
|
||||
* The delimiter regular expression that should be used to separate fields in data rows
|
||||
* and header.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public String delimiterRegex() {
|
||||
return DELIMITER_REGEX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used by ROD management system to set the data in this ROD associated with a line in a rod
|
||||
*
|
||||
* @param headerObj
|
||||
* @param parts
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public boolean parseLine(final Object headerObj, final String[] parts) throws IOException {
|
||||
header = (ArrayList<String>)(headerObj);
|
||||
|
||||
//System.out.printf("parts [len=%d] is '%s'%n", parts.length, Utils.join(":", parts));
|
||||
|
||||
if ( parts.length == 0 || COMMENT_PATTERN.matcher(parts[0]).matches() || HEADER_PATTERN.matcher(parts[0]).matches() )
|
||||
return false;
|
||||
|
||||
if ( !allowIncompleteRecords() && header.size() != parts.length) {
|
||||
throw new IOException(String.format("Header length %d not equal to Tabular parts length %d", header.size(), parts.length));
|
||||
}
|
||||
|
||||
for ( int i = 0; i < parts.length; i++ ) {
|
||||
put(header.get(i), parts[i]);
|
||||
}
|
||||
|
||||
if ( printRecordsParsed ) System.out.printf("Parsed %d records %s%n", ++parsedRecords, this);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@@ -42,10 +42,8 @@ public class VariantContextAdaptors {

    static {
        adaptors.put(DbSNPFeature.class, new DBSnpAdaptor());
        adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
        adaptors.put(HapMapFeature.class, new HapMapAdaptor());
        adaptors.put(GeliTextFeature.class, new GeliTextAdaptor());
        adaptors.put(rodGELI.class, new GeliAdaptor());
        adaptors.put(VariantContext.class, new VariantContextAdaptor());
    }
@ -134,96 +132,6 @@ public class VariantContextAdaptors {
|
|||
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Plink to VariantContext
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private static class PlinkRodAdaptor extends VCAdaptor {
|
||||
// VariantContext convert(String name, Object input) {
|
||||
// return convert(name, input, null);
|
||||
// }
|
||||
|
||||
VariantContext convert(String name, Object input, ReferenceContext ref) {
|
||||
if ( ref == null )
|
||||
throw new UnsupportedOperationException("Conversion from Plink to VariantContext requires a reference context");
|
||||
|
||||
PlinkRod plink = (PlinkRod)input;
|
||||
|
||||
HashSet<Allele> VCAlleles = new HashSet<Allele>();
|
||||
Allele refAllele = determineRefAllele(plink, ref);
|
||||
VCAlleles.add(refAllele);
|
||||
|
||||
// mapping from Plink Allele to VC Allele (since the PlinkRod does not annotate any of the Alleles as reference)
|
||||
HashMap<Allele, Allele> plinkAlleleToVCAllele = new HashMap<Allele, Allele>();
|
||||
|
||||
Set<Genotype> genotypes = new HashSet<Genotype>();
|
||||
|
||||
Map<String, Allele[]> genotypeSets = plink.getGenotypes();
|
||||
|
||||
// We need to iterate through this list and recreate the Alleles since the
|
||||
// PlinkRod does not annotate any of the Alleles as reference.
|
||||
// for each sample...
|
||||
for ( Map.Entry<String, Allele[]> genotype : genotypeSets.entrySet() ) {
|
||||
ArrayList<Allele> myVCAlleles = new ArrayList<Allele>(2);
|
||||
|
||||
// for each allele...
|
||||
for ( Allele myPlinkAllele : genotype.getValue() ) {
|
||||
Allele VCAllele;
|
||||
if ( myPlinkAllele.isNoCall() ) {
|
||||
VCAllele = Allele.NO_CALL;
|
||||
} else {
|
||||
if ( plinkAlleleToVCAllele.containsKey(myPlinkAllele) ) {
|
||||
VCAllele = plinkAlleleToVCAllele.get(myPlinkAllele);
|
||||
} else {
|
||||
if ( !plink.isIndel() ) {
|
||||
VCAllele = Allele.create(myPlinkAllele.getBases(), refAllele.equals(myPlinkAllele, true));
|
||||
} else if ( myPlinkAllele.isNull() ) {
|
||||
VCAllele = Allele.create(Allele.NULL_ALLELE_STRING, plink.isInsertion());
|
||||
} else {
|
||||
VCAllele = Allele.create(myPlinkAllele.getBases(), !plink.isInsertion());
|
||||
}
|
||||
plinkAlleleToVCAllele.put(myPlinkAllele, VCAllele);
|
||||
VCAlleles.add(VCAllele);
|
||||
}
|
||||
}
|
||||
|
||||
myVCAlleles.add(VCAllele);
|
||||
}
|
||||
|
||||
// create the genotype
|
||||
genotypes.add(new Genotype(genotype.getKey(), myVCAlleles));
|
||||
}
|
||||
|
||||
// create the variant context
|
||||
try {
|
||||
GenomeLoc loc = GenomeLocParser.setStop(plink.getLocation(), plink.getLocation().getStop() + plink.getLength()-1);
|
||||
VariantContext vc = VariantContextUtils.toVC(plink.getVariantName(), loc, VCAlleles, genotypes);
|
||||
return vc;
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file");
|
||||
}
|
||||
}
|
||||
|
||||
private Allele determineRefAllele(PlinkRod plink, ReferenceContext ref) {
|
||||
Allele refAllele;
|
||||
if ( !plink.isIndel() ) {
|
||||
refAllele = Allele.create(ref.getBase(), true);
|
||||
} else if ( plink.isInsertion() ) {
|
||||
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
} else {
|
||||
long maxLength = ref.getWindow().getStop() - ref.getLocus().getStop();
|
||||
if ( plink.getLength() > maxLength )
|
||||
throw new UnsupportedOperationException("Plink conversion currently can only handle indels up to length " + maxLength);
|
||||
refAllele = deletionAllele(ref, 1, plink.getLength());
|
||||
|
||||
}
|
||||
return refAllele;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// GELI to VariantContext
|
||||
|
|
@ -291,78 +199,6 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// GELI to VariantContext
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private static class GeliAdaptor extends VCAdaptor {
|
||||
/**
|
||||
* convert to a Variant Context, given:
|
||||
* @param name the name of the ROD
|
||||
* @param input the Rod object, in this case a RodGeliText
|
||||
* @return a VariantContext object
|
||||
*/
|
||||
// VariantContext convert(String name, Object input) {
|
||||
// return convert(name, input, null);
|
||||
// }
|
||||
|
||||
/**
|
||||
* convert to a Variant Context, given:
|
||||
* @param name the name of the ROD
|
||||
* @param input the Rod object, in this case a RodGeliText
|
||||
* @param ref the reference context
|
||||
* @return a VariantContext object
|
||||
*/
|
||||
VariantContext convert(String name, Object input, ReferenceContext ref) {
|
||||
GenotypeLikelihoods geli = ((rodGELI)input).getGeliRecord();
|
||||
if ( ! Allele.acceptableAlleleBases(String.valueOf((char)geli.getReferenceBase()),true) )
|
||||
return null;
|
||||
Allele refAllele = Allele.create(String.valueOf((char)geli.getReferenceBase()), true);
|
||||
|
||||
// add the reference allele
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(refAllele);
|
||||
|
||||
// add the two alt alleles
|
||||
if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele1()),false)) {
|
||||
return null;
|
||||
}
|
||||
Allele allele1 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele1()), false);
|
||||
if (!alleles.contains(allele1) && !allele1.equals(refAllele, true)) alleles.add(allele1);
|
||||
|
||||
// add the two alt alleles
|
||||
if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele2()),false)) {
|
||||
return null;
|
||||
}
|
||||
Allele allele2 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele2()), false);
|
||||
if (!alleles.contains(allele2) && !allele2.equals(refAllele, true)) alleles.add(allele2);
|
||||
|
||||
|
||||
Map<String, String> attributes = new HashMap<String, String>();
|
||||
Collection<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
MutableGenotype call = new MutableGenotype(name, alleles);
|
||||
|
||||
// setup the genotype likelihoods
|
||||
double[] post = new double[10];
|
||||
String[] gTypes = {"AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT"};
|
||||
for (int x = 0; x < 10; x++)
|
||||
post[x] = Double.valueOf(geli.getLikelihood(DiploidGenotype.fromBases((byte) gTypes[x].charAt(0), (byte) gTypes[x].charAt(1))));
|
||||
|
||||
// set the likelihoods, depth, and RMS mapping quality values
|
||||
//call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY, post);
|
||||
//call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY, (int) geli.getMaxMappingQuality());
|
||||
//call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY, geli.getNumReads());
|
||||
|
||||
// add the call to the genotype list, and then use this list to create a VariantContext
|
||||
genotypes.add(call);
|
||||
VariantContext vc = VariantContextUtils.toVC(name, ((rodGELI) input).getLocation(), alleles, genotypes, geli.getBestToReferenceLod(), null, attributes);
|
||||
return vc;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// HapMap to VariantContext
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,63 @@
package org.broadinstitute.sting.gatk.refdata.features.table;

import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;

import java.io.IOException;
import java.util.*;

/**
 * implementation of a simple table (tab or comma delimited format) input files... more improvements to come
 */
public class TableCodec implements FeatureCodec {
    private String delimiterRegex = "\\s+";
    private String headerDelimiter = "HEADER";
    private String commentDelimiter = "#";
    private ArrayList<String> header = new ArrayList<String>();

    @Override
    public Feature decodeLoc(String line) {
        return decode(line);
    }

    @Override
    public Feature decode(String line) {
        if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter))
            return null;
        String[] split = line.split(delimiterRegex);
        if (split.length < 1)
            throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format");

        return new TableFeature(GenomeLocParser.parseGenomeLoc(split[0]),Arrays.asList(split),header);
    }

    @Override
    public Class getFeatureType() {
        return TableFeature.class;
    }

    @Override
    public Object readHeader(LineReader reader) {
        String line = "";
        try {
            while ((line = reader.readLine()) != null) {
                if (line.startsWith(headerDelimiter)) {
                    if (header.size() > 0) throw new IllegalStateException("Input table file seems to have two header lines. The second is = " + line);
                    String spl[] = line.split(delimiterRegex);
                    for (String s : spl) header.add(s);
                } else if (!line.startsWith(commentDelimiter)) {
                    break;
                }
            }
        } catch (IOException e) {
            throw new UserException.MalformedFile("unable to parse header from TableCodec file",e);
        }
        return header;
    }
}
@ -0,0 +1,56 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.features.table;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A feature representing a single row out of a text table
|
||||
*/
|
||||
public class TableFeature implements Feature {
|
||||
// stores the values for the columns seperated out
|
||||
private final List<String> values;
|
||||
|
||||
// if we have column names, we store them here
|
||||
private final List<String> keys;
|
||||
|
||||
// our location
|
||||
private final GenomeLoc position;
|
||||
|
||||
public TableFeature(GenomeLoc position, List<String> values, List<String> keys) {
|
||||
this.values = values;
|
||||
this.keys = keys;
|
||||
this.position = position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getChr() {
|
||||
return position.getContig();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getStart() {
|
||||
return (int)position.getStart();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getEnd() {
|
||||
return (int)position.getStop();
|
||||
}
|
||||
|
||||
public String getValue(int columnPosition) {
|
||||
if (columnPosition >= values.size()) throw new IllegalArgumentException("We only have " + values.size() + "columns, the requested column = " + columnPosition);
|
||||
return values.get(columnPosition);
|
||||
}
|
||||
|
||||
public String get(String columnName) {
|
||||
int position = keys.indexOf(columnName);
|
||||
if (position < 0) throw new IllegalArgumentException("We don't have a column named " + columnName);
|
||||
return values.get(position);
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
return this.position;
|
||||
}
|
||||
}
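// --- Hypothetical usage sketch, not part of this commit ---
// Builds a TableFeature directly with header keys supplied, so columns can be read
// back by index or by name; all values here are invented, and GenomeLocParser is
// assumed to be initialized (imports for GenomeLocParser and java.util.* assumed).
static void tableFeatureSketch() {
    List<String> header = Arrays.asList("HEADER", "name", "score"); // as produced by TableCodec.readHeader()
    List<String> row    = Arrays.asList("chr1:100", "geneA", "42");
    TableFeature feature = new TableFeature(GenomeLocParser.parseGenomeLoc("chr1:100"), row, header);
    System.out.println(feature.get("name"));   // "geneA" -- looked up via the header keys
    System.out.println(feature.getValue(2));   // "42"
}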
|
||||
|
|
@ -1,162 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.IOException;
|
||||
import java.io.File;
|
||||
|
||||
import edu.mit.broad.picard.genotype.geli.GeliFileReader;
|
||||
import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods;
|
||||
import edu.mit.broad.picard.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
|
||||
|
||||
/**
|
||||
* This class wraps Picard Geli CHiP data and presents it as a ROD.
|
||||
*/
|
||||
|
||||
public class rodGELI extends BasicReferenceOrderedDatum {
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
private GenotypeLikelihoods gh = null;
|
||||
|
||||
public rodGELI(final String name, GenotypeLikelihoods gh) {
|
||||
super(name);
|
||||
this.gh = gh;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc getLocation() {
|
||||
return GenomeLocParser.createGenomeLoc(gh.getSequenceIndex(), gh.getPosition());
|
||||
}
|
||||
|
||||
public GenotypeLikelihoods getGeliRecord() {
|
||||
return gh;
|
||||
}
|
||||
|
||||
/** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator,
|
||||
* so this method does nothing at all (always returns false).
|
||||
*
|
||||
*/
|
||||
@Override
|
||||
public boolean parseLine(Object header, String[] parts) throws IOException {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder builder = new StringBuilder();
|
||||
builder.append(gh.getSequenceName() + "\t");
|
||||
builder.append(gh.getPosition() + "\t");
|
||||
builder.append(Character.toString((char)(gh.getReferenceBase() & 0xff)) + "\t");
|
||||
builder.append(gh.getNumReads() + "\t");
|
||||
builder.append(gh.getMaxMappingQuality() + "\t");
|
||||
builder.append(gh.getBestGenotype().name() + "\t");
|
||||
builder.append(gh.getBestToReferenceLod() + "\t");
|
||||
builder.append(gh.getBestToSecondBestLod() + "\t");
|
||||
builder.append("\t"); // no dbSNP info in GenotypeLikelihoods class
|
||||
for (final DiploidGenotype genotype : DiploidGenotype.values())
|
||||
builder.append(gh.getLikelihood(genotype) + "\t");
|
||||
builder.append("\n");
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private static class rodGELIIterator implements Iterator<rodGELI> {
|
||||
|
||||
private String rodName = null;
|
||||
private GeliFileReader parser = null;
|
||||
private CloseableIterator<GenotypeLikelihoods> iterator = null;
|
||||
|
||||
rodGELIIterator(String name, File f) {
|
||||
rodName = name;
|
||||
parser = new GeliFileReader(f);
|
||||
iterator = parser.iterator();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return iterator.hasNext();
|
||||
}
|
||||
/**
|
||||
* @return the next element in the iteration.
|
||||
* @throws NoSuchElementException - iterator has no more elements.
|
||||
*/
|
||||
public rodGELI next() {
|
||||
if (!this.hasNext()) throw new NoSuchElementException("RodGELI next called on iterator with no more elements");
|
||||
return new rodGELI(rodName, iterator.next());
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("'remove' operation is not supported for GELIs");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static Iterator<rodGELI> createIterator(String name, File file) {
|
||||
return new rodGELI.rodGELIIterator(name,file);
|
||||
}
|
||||
|
||||
/****
|
||||
* We no longer want to use this class as truth data (sigh)
|
||||
*
|
||||
public List<String> getFWDAlleles() {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
||||
public String getFWDRefBases() {
|
||||
return "";
|
||||
}
|
||||
|
||||
public char getRef() {
|
||||
return (char)(gh.getReferenceBase() & 0xff);
|
||||
}
|
||||
|
||||
public boolean isPointGenotype() { return true; }
|
||||
public boolean isIndelGenotype() { return false; }
|
||||
public boolean isSNP() { return true; }
|
||||
public boolean isReference() { return gh.isHomozygousReference(); }
|
||||
public boolean isInsertion() { return false; }
|
||||
public boolean isDeletion() { return false; }
|
||||
public boolean isIndel() { return false; }
|
||||
public boolean isBiallelic() { return false; }
|
||||
public boolean isHom() { return gh.isHomozyous(); }
|
||||
public boolean isHet() { return !isHom(); }
|
||||
|
||||
public double getVariantConfidence() {
|
||||
return gh.getBestToReferenceLod();
|
||||
}
|
||||
|
||||
public double getConsensusConfidence() {
|
||||
return gh.getBestToSecondBestLod();
|
||||
}
|
||||
*
|
||||
*/
|
||||
|
||||
public static void main(String argv[]) {
|
||||
String testFile = "NA12878.geli";
|
||||
|
||||
Iterator<rodGELI> it = createIterator("test-geli", new File(testFile));
|
||||
|
||||
net.sf.picard.reference.ReferenceSequenceFileWalker reference = new net.sf.picard.reference.ReferenceSequenceFileWalker(new File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
|
||||
if ( reference.getSequenceDictionary() == null ) {
|
||||
System.out.println("No reference sequence dictionary found. Abort.");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary());
|
||||
|
||||
int counter = 0;
|
||||
|
||||
while ( it.hasNext() && counter < 500 ) {
|
||||
rodGELI rg = it.next();
|
||||
System.out.println(rg.toString());
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -25,9 +25,16 @@ package org.broadinstitute.sting.gatk.refdata.tracks;
|
|||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureSource;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Type;
|
||||
import java.util.Iterator;
|
||||
|
||||
|
|
@ -39,7 +46,7 @@ import java.util.Iterator;
|
|||
* <p/>
|
||||
* the basics of what a reference metadata track must contain.
|
||||
*/
|
||||
public abstract class RMDTrack {
|
||||
public class RMDTrack {
|
||||
|
||||
// the basics of a track:
|
||||
private final Class type; // our type
|
||||
|
|
@ -47,20 +54,14 @@ public abstract class RMDTrack {
|
|||
private final String name; // the name
|
||||
private final File file; // the associated file we create the reader from
|
||||
|
||||
/**
|
||||
* Create a track
|
||||
*
|
||||
* @param type the type of track, used for track lookup
|
||||
* @param recordType the type of record produced
|
||||
* @param name the name of this specific track
|
||||
* @param file the associated file, for reference or recreating the reader
|
||||
*/
|
||||
protected RMDTrack(Class type, Class recordType, String name, File file) {
|
||||
this.type = type;
|
||||
this.recordType = recordType;
|
||||
this.name = name;
|
||||
this.file = file;
|
||||
}
|
||||
// our feature reader - allows queries
|
||||
private BasicFeatureSource reader;
|
||||
|
||||
// our sequence dictionary, which can be null
|
||||
private final SAMSequenceDictionary dictionary;
|
||||
|
||||
// our codec type
|
||||
private final FeatureCodec codec;
|
||||
|
||||
public Class getType() {
|
||||
return type;
|
||||
|
|
@ -78,12 +79,6 @@ public abstract class RMDTrack {
|
|||
return recordType;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return how to get an iterator of the underlying data. This is all a track has to support,
|
||||
* but other more advanced tracks support the query interface
|
||||
*/
|
||||
public abstract CloseableIterator<GATKFeature> getIterator();
|
||||
|
||||
/**
|
||||
* helper function for determining if we are the same track based on name and record type
|
||||
*
|
||||
|
|
@ -96,21 +91,90 @@ public abstract class RMDTrack {
|
|||
return (name.equals(this.name) && (type.getClass().isAssignableFrom(this.type.getClass())));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a track
|
||||
*
|
||||
* @param type the type of track, used for track lookup
|
||||
* @param name the name of this specific track
|
||||
* @param file the associated file, for reference or recreating the reader
|
||||
* @param reader the feature reader to use as the underlying data source
|
||||
* @param dict the sam sequence dictionary
|
||||
* @param codec the feature codec we use to decode this type
|
||||
*/
|
||||
public RMDTrack(Class type, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict, FeatureCodec codec) {
|
||||
this.type = type;
|
||||
this.recordType = codec.getFeatureType();
|
||||
this.name = name;
|
||||
this.file = file;
|
||||
this.reader = reader;
|
||||
this.dictionary = dict;
|
||||
this.codec = codec;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return how to get an iterator of the underlying data. This is all a track has to support,
|
||||
* but other more advanced tracks support the query interface
|
||||
*/
|
||||
public CloseableIterator<GATKFeature> getIterator() {
|
||||
try {
|
||||
return new FeatureToGATKFeatureIterator(reader.iterator(),this.getName());
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(getFile(), "Unable to read from file", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* do we support the query interface?
|
||||
* @return true if we can be cast to the QueryableTrack interface
|
||||
*
|
||||
* @return true
|
||||
*/
|
||||
public abstract boolean supportsQuery();
|
||||
public boolean supportsQuery() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(GenomeLoc interval) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName());
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(GenomeLoc interval, boolean contained) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName());
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(String contig, int start, int stop) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName());
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(String contig, int start, int stop, boolean contained) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName());
|
||||
}
|
||||
|
||||
public void close() {
|
||||
try {
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
throw new UserException.MalformedFile("Unable to close reader " + reader.toString(),e);
|
||||
}
|
||||
reader = null;
|
||||
}
|
||||
|
||||
public FeatureSource getReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public SAMSequenceDictionary getSequenceDictionary() {
|
||||
return null; // default, others can override this
|
||||
return dictionary;
|
||||
}
|
||||
|
||||
public Object getHeader() {
|
||||
return null;
|
||||
return reader.getHeader();
|
||||
}
|
||||
|
||||
public FeatureCodec getCodec() {
|
||||
return codec;
|
||||
}
|
||||
}
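// --- Hypothetical usage sketch, not part of this commit ---
// Builds a concrete RMDTrack through the new RMDTrackBuilder and queries it over a
// region; the codec class, track name and file name are invented, and the input is
// assumed to have (or be able to get) a Tribble index. Imports for File, IOException,
// CloseableIterator, GATKFeature and the builder/codec classes are assumed.
static void rmdTrackSketch() throws IOException, RMDTrackCreationException {
    RMDTrackBuilder builder = new RMDTrackBuilder();
    RMDTrack track = builder.createInstanceOfTrack(TableCodec.class, "mytable", new File("example.table"));
    CloseableIterator<GATKFeature> it = track.query("chr1", 1, 1000); // supportsQuery() now always returns true
    while (it.hasNext()) {
        GATKFeature feature = it.next();
        // ... use feature ...
    }
    it.close();
    track.close();
}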
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Find the available track builders, and create the requisite tracks from the command line.
|
||||
*
|
||||
* In Tribble RMD tracks have two classes:
|
||||
* - a Feature that is the model/view for the data
|
||||
* - a Codec that is the controller to generate the Feature.
|
||||
*
|
||||
* In this class, the track types are the Codecs. The track record types are the Features.
|
||||
*/
|
||||
public class RMDTrackManager extends PluginManager<RMDTrackBuilder> {
|
||||
// the input strings we use to create RODs from
|
||||
List<RMDTriplet> inputs = new ArrayList<RMDTriplet>();
|
||||
|
||||
// create an active mapping of builder instances, and a map of the name -> class for convenience
|
||||
/** the tracks that are available to us, associated with their builder */
|
||||
Map<String, RMDTrackBuilder> availableTrackBuilders;
|
||||
/** the classes names, with their class description (think the Controller Codecs) */
|
||||
Map<String, Class> availableTrackTypes;
|
||||
/** the available track record types (think the Model/View Features) */
|
||||
Map<String, Class> availableTrackRecordTypes;
|
||||
|
||||
/** Create a new track plugin manager. */
|
||||
public RMDTrackManager() {
|
||||
super(RMDTrackBuilder.class, "TrackBuilders", null);
|
||||
}
|
||||
|
||||
/**
|
||||
* find the associated reference meta data
|
||||
*
|
||||
* @param bindings the bindings of strings from the -B command line option
|
||||
*
|
||||
* @return a list of RMDTracks, one for each -B option
|
||||
*/
|
||||
public List<RMDTrack> getReferenceMetaDataSources(GenomeAnalysisEngine engine,List<String> bindings) {
|
||||
initializeTrackTypes();
|
||||
initializeBindings(engine,bindings);
|
||||
// try and make the tracks given their requests
|
||||
return createRequestedTrackObjects();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a collection of track names that match the record type.
|
||||
* @param trackRecordType the record type specified in the @RMD annotation
|
||||
* @return a collection of available track record type names that match the record type
|
||||
*/
|
||||
public Collection<String> getTrackRecordTypeNames(Class trackRecordType) {
|
||||
initializeTrackTypes();
|
||||
initializeTrackRecordTypes();
|
||||
Set<String> names = new TreeSet<String>();
|
||||
for (Map.Entry<String, Class> availableTrackRecordType: availableTrackRecordTypes.entrySet()) {
|
||||
if (trackRecordType.isAssignableFrom(availableTrackRecordType.getValue()))
|
||||
names.add(availableTrackRecordType.getKey());
|
||||
}
|
||||
return names;
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize our lists of bindings
|
||||
* @param engine The engine, used to populate tags.
|
||||
* @param bindings the input to the GATK, as a list of strings passed in through the -B options
|
||||
*/
|
||||
private void initializeBindings(GenomeAnalysisEngine engine,List<String> bindings) {
|
||||
// NOTE: This method effectively acts as a one-time initializer: once the inputs have been passed in, they are locked in.
|
||||
if (inputs.size() > 0 || bindings.size() == 0)
|
||||
return;
|
||||
|
||||
for (String binding: bindings) {
|
||||
if(engine != null && engine.getTags(binding).size() == 2) {
|
||||
// Assume that if tags are present, those tags are name and type.
|
||||
// Name is always first, followed by type.
|
||||
List<String> parameters = engine.getTags(binding);
|
||||
String name = parameters.get(0);
|
||||
String type = parameters.get(1);
|
||||
inputs.add(new RMDTriplet(name,type,binding));
|
||||
}
|
||||
else {
|
||||
// Otherwise, use old-format bindings.
|
||||
String[] split = binding.split(",");
|
||||
if (split.length != 3) throw new IllegalArgumentException(binding + " is not a valid reference metadata track description");
|
||||
inputs.add(new RMDTriplet(split[0], split[1], split[2]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize our lists of tracks and builders
|
||||
*/
|
||||
private void initializeTrackTypes() {
|
||||
if (availableTrackBuilders != null && availableTrackTypes != null)
|
||||
return;
|
||||
|
||||
// create an active mapping of builder instances, and a map of the name -> class for convenience
|
||||
availableTrackBuilders = new HashMap<String, RMDTrackBuilder>();
|
||||
availableTrackTypes = new HashMap<String, Class>();
|
||||
createBuilderObjects();
|
||||
}
|
||||
|
||||
/**
|
||||
* create the builder objects from the retrieved list
|
||||
*/
|
||||
private void createBuilderObjects() {
|
||||
// create a track builder instance for each track builder, and find out what tracks we can make
|
||||
for (String builderName : this.pluginsByName.keySet()) {
|
||||
RMDTrackBuilder builder = this.createByName(builderName);
|
||||
Map<String, Class> mapping = builder.getAvailableTrackNamesAndTypes();
|
||||
for (String name : mapping.keySet()) {
|
||||
availableTrackBuilders.put(name.toUpperCase(), builder);
|
||||
availableTrackTypes.put(name.toUpperCase(), mapping.get(name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize our list of track record types
|
||||
*/
|
||||
private void initializeTrackRecordTypes() {
|
||||
if (availableTrackRecordTypes != null)
|
||||
return;
|
||||
|
||||
availableTrackRecordTypes = new HashMap<String, Class>();
|
||||
for (RMDTrackBuilder builder : availableTrackBuilders.values()) {
|
||||
Map<String, Class> mapping = builder.getAvailableTrackNamesAndRecordTypes();
|
||||
for (String name : mapping.keySet()) {
|
||||
availableTrackRecordTypes.put(name.toUpperCase(), mapping.get(name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* create the requested track objects
|
||||
*
|
||||
* @return a list of the tracks, one for each of the requested input tracks
|
||||
*/
|
||||
private List<RMDTrack> createRequestedTrackObjects() {
|
||||
// create live instances of the tracks
|
||||
List<RMDTrack> tracks = new ArrayList<RMDTrack>();
|
||||
|
||||
// create instances of each of the requested types
|
||||
for (RMDTriplet trip : inputs) {
|
||||
RMDTrackBuilder b = availableTrackBuilders.get(trip.getType().toUpperCase());
|
||||
if (b == null) throw new UserException.CommandLineException("Unable to find track for " + trip.getType());
|
||||
tracks.add(b.createInstanceOfTrack(availableTrackTypes.get(trip.getType().toUpperCase()), trip.getName(), new File(trip.getFile())));
|
||||
}
|
||||
return tracks;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class RODRMDTrack
|
||||
*
|
||||
* wrap a reference ordered data object in the new track style. This track will hopefully be phased-out as we move to
|
||||
* a FeatureReader based system.
|
||||
*/
|
||||
public class RODRMDTrack extends RMDTrack {
|
||||
|
||||
// our ROD
|
||||
private ReferenceOrderedData data;
|
||||
|
||||
/**
|
||||
* Create a track
|
||||
*
|
||||
* @param type the type of track, used for track lookup
|
||||
* @param name the name of this specific track
|
||||
* @param file the associated file, for reference or recreating the reader
|
||||
* @param data the ROD to use as the underlying data source for this track
|
||||
*/
|
||||
public RODRMDTrack(Class type, String name, File file, ReferenceOrderedData data) {
|
||||
super(type, type, name, file); // the decoder type and the record type are the same in this case
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return how to get an iterator of the underlying data. This is all a track has to support,
|
||||
* but other more advanced tracks support the query interface
|
||||
*/
|
||||
@Override
|
||||
public CloseableIterator<GATKFeature> getIterator() {
|
||||
return new GATKFeatureIterator(data.iterator());
|
||||
}
|
||||
|
||||
/**
|
||||
* do we support the query interface?
|
||||
*
|
||||
* @return false
|
||||
*/
|
||||
@Override
|
||||
public boolean supportsQuery() {
|
||||
return false; // sadly no, we don't
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,142 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureSource;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class TribbleTrack
|
||||
*
|
||||
* A feature reader track, implementing the RMDTrack for tracks that are generated out of Tribble
|
||||
*/
|
||||
public class TribbleTrack extends RMDTrack implements QueryableTrack {
|
||||
// our feature reader - allows queries
|
||||
private BasicFeatureSource reader;
|
||||
|
||||
// our sequence dictionary, which can be null
|
||||
private final SAMSequenceDictionary dictionary;
|
||||
|
||||
// our codec type
|
||||
private final FeatureCodec codec;
|
||||
/**
|
||||
* Create a track
|
||||
*
|
||||
* @param type the type of track, used for track lookup
|
||||
* @param name the name of this specific track
|
||||
* @param file the associated file, for reference or recreating the reader
|
||||
* @param reader the feature reader to use as the underlying data source
|
||||
* @param dict the sam sequence dictionary
|
||||
* @param codec the feature codec we use to decode this type
|
||||
*/
|
||||
public TribbleTrack(Class type, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict, FeatureCodec codec) {
|
||||
super(type, codec.getFeatureType(), name, file);
|
||||
this.reader = reader;
|
||||
this.dictionary = dict;
|
||||
this.codec = codec;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return how to get an iterator of the underlying data. This is all a track has to support,
|
||||
* but other more advanced tracks support the query interface
|
||||
*/
|
||||
@Override
|
||||
public CloseableIterator<GATKFeature> getIterator() {
|
||||
try {
|
||||
return new FeatureToGATKFeatureIterator(reader.iterator(),this.getName());
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(getFile(), "Unable to read from file", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* do we support the query interface?
|
||||
*
|
||||
* @return true
|
||||
*/
|
||||
@Override
|
||||
public boolean supportsQuery() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(GenomeLoc interval) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName());
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(GenomeLoc interval, boolean contained) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName());
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(String contig, int start, int stop) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName());
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKFeature> query(String contig, int start, int stop, boolean contained) throws IOException {
|
||||
return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName());
|
||||
}
|
||||
|
||||
public void close() {
|
||||
try {
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Unable to close reader " + reader.toString(),e);
|
||||
}
|
||||
reader = null;
|
||||
}
|
||||
|
||||
public FeatureSource getReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public SAMSequenceDictionary getSequenceDictionary() {
|
||||
return dictionary;
|
||||
}
|
||||
|
||||
public Object getHeader() {
|
||||
return reader.getHeader();
|
||||
}
|
||||
|
||||
public FeatureCodec getCodec() {
|
||||
return codec;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
|
|
@ -11,41 +12,88 @@
|
|||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks.builders;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.*;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.file.FSLockWithShared;
|
||||
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Interface RMDTrackBuilder
|
||||
* <p/>
|
||||
* The basic interface for finding and parsing RMDTracks. Track builders present an interface that allows
|
||||
* the track manager to find and create tracks of the specified type.
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class RMDTrackBuilder
|
||||
*
|
||||
* This class keeps track of the available codecs, and knows how to put together a track
|
||||
* that gets iterators from the FeatureReader using Tribble.
|
||||
*
|
||||
*/
|
||||
public interface RMDTrackBuilder {
|
||||
public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
|
||||
|
||||
/** @return a list of all available track types we currently have access to create */
|
||||
public Map<String, Class> getAvailableTrackNamesAndTypes();
|
||||
// the input strings we use to create RODs from
|
||||
List<RMDTriplet> inputs = new ArrayList<RMDTriplet>();
|
||||
|
||||
// the linear index extension
|
||||
public static final String indexExtension = ".idx";
|
||||
|
||||
private Map<String, Class> classes = null;
|
||||
|
||||
/** Create a new plugin manager. */
|
||||
public RMDTrackBuilder() {
|
||||
super(FeatureCodec.class, "Codecs", "Codec");
|
||||
}
|
||||
|
||||
/** @return a list of all available track types we currently have access to create */
|
||||
public Map<String, Class> getAvailableTrackNamesAndTypes() {
|
||||
classes = new HashMap<String, Class>();
|
||||
for (String name: this.pluginsByName.keySet()) {
|
||||
classes.put(name.toUpperCase(), pluginsByName.get(name));
|
||||
}
|
||||
return classes;
|
||||
}
|
||||
|
||||
/** @return a list of all available track record types we currently have access to create */
|
||||
public Map<String, Class> getAvailableTrackNamesAndRecordTypes();
|
||||
public Map<String, Class> getAvailableTrackNamesAndRecordTypes() {
|
||||
HashMap classToRecord = new HashMap<String, Class>();
|
||||
for (String name: this.pluginsByName.keySet()) {
|
||||
FeatureCodec codec = this.createByName(name);
|
||||
classToRecord.put(name, codec.getFeatureType());
|
||||
}
|
||||
return classToRecord;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a RMDTrack of the specified type
|
||||
|
|
@ -55,7 +103,317 @@ public interface RMDTrackBuilder {
|
|||
* @param inputFile the input file
|
||||
*
|
||||
* @return an instance of the track
|
||||
* @throws RMDTrackCreationException if we don't know of the target class or we couldn't create it
|
||||
* @throws RMDTrackCreationException
|
||||
* if we don't know of the target class or we couldn't create it
|
||||
*/
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException;
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
|
||||
// return a feature reader track
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile);
|
||||
if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file");
|
||||
return new RMDTrack(targetClass, name, inputFile, pair.first, pair.second, createCodec(targetClass, name));
|
||||
}
|
||||
|
||||
/**
|
||||
* create a tribble feature reader class, given the target class and the input file
|
||||
* @param targetClass the target class, of a Tribble Codec type
|
||||
* @param inputFile the input file, that corresponds to the feature type
|
||||
* @return a pair of <BasicFeatureSource, SAMSequenceDictionary>
|
||||
*/
|
||||
public Pair<BasicFeatureSource, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
|
||||
return createFeatureReader(targetClass, "anonymous", inputFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a feature reader of the specified type
|
||||
* @param targetClass the target codec type
|
||||
* @param name the target name
|
||||
* @param inputFile the input file to create the track from (of the codec type)
|
||||
* @return the FeatureReader instance
|
||||
*/
|
||||
public Pair<BasicFeatureSource, SAMSequenceDictionary> createFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pair;
|
||||
if (inputFile.getAbsolutePath().endsWith(".gz"))
|
||||
pair = createBasicFeatureSourceNoAssumedIndex(targetClass, name, inputFile);
|
||||
else
|
||||
pair = getLinearFeatureReader(targetClass, name, inputFile);
|
||||
return pair;
|
||||
}
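// --- Hypothetical usage sketch, not part of this commit ---
// Obtains a raw feature source without wrapping it in a track; the codec class and
// file name are invented. A ".gz" input is assumed to be tabix indexed, anything else
// goes through the linear-index path above.
static void featureReaderSketch() {
    RMDTrackBuilder builder = new RMDTrackBuilder();
    Pair<BasicFeatureSource, SAMSequenceDictionary> pair =
            builder.createFeatureReader(TableCodec.class, "mytable", new File("example.table"));
    BasicFeatureSource source = pair.first;    // supports iteration and region queries
    SAMSequenceDictionary dict = pair.second;  // null for the tabix (".gz") path
}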
|
||||
|
||||
/**
|
||||
* create a feature reader, without assuming there exists an index. This code assumes the feature
|
||||
* reader of the appropriate type will figure out what the right index type is, and determine if it
|
||||
* exists.
|
||||
*
|
||||
* @param targetClass the codec class type
|
||||
* @param name the name of the track
|
||||
* @param inputFile the file to load
|
||||
* @return a feature reader implementation
|
||||
*/
|
||||
private Pair<BasicFeatureSource, SAMSequenceDictionary> createBasicFeatureSourceNoAssumedIndex(Class targetClass, String name, File inputFile) {
|
||||
// we might not know the index type, try loading with the default reader constructor
|
||||
logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file");
|
||||
try {
|
||||
return new Pair<BasicFeatureSource, SAMSequenceDictionary>(BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null);
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create feature reader from file", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* add a name to the codec, if it takes one
|
||||
* @param targetClass the class to create a codec for
|
||||
* @param name the name to assign this codec
|
||||
* @return the feature codec itself
|
||||
*/
|
||||
private FeatureCodec createCodec(Class targetClass, String name) {
|
||||
FeatureCodec codex = this.createByType(targetClass);
|
||||
if ( codex instanceof NameAwareCodec )
|
||||
((NameAwareCodec)codex).setName(name);
|
||||
return codex;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a linear feature reader, where we create the index ahead of time
|
||||
* @param targetClass the target class
|
||||
* @param name the name of the codec
|
||||
* @param inputFile the tribble file to parse
|
||||
* @return the input file as a FeatureReader
|
||||
*/
|
||||
private Pair<BasicFeatureSource, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> reader;
|
||||
try {
|
||||
Index index = loadIndex(inputFile, createCodec(targetClass, name), true);
|
||||
reader = new Pair<BasicFeatureSource, SAMSequenceDictionary>(new BasicFeatureSource(inputFile.getAbsolutePath(),
|
||||
index,
|
||||
createCodec(targetClass, name)),
|
||||
sequenceSetToDictionary(index.getSequenceNames()));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create reader with file", e);
|
||||
} catch (IOException e) {
|
||||
throw new UserException("Unable to make the index file for " + inputFile, e);
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* create an index for the input file
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec to use
|
||||
* @param onDisk write the index to disk?
|
||||
* @return a linear index for the specified type
|
||||
* @throws IOException if we cannot write the index file
|
||||
*/
|
||||
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException {
|
||||
|
||||
// create the index file name, locking on the index file name
|
||||
File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension);
|
||||
FSLockWithShared lock = new FSLockWithShared(indexFile);
|
||||
|
||||
// acquire a lock on the file
|
||||
Index idx = null;
|
||||
if (indexFile.canRead())
|
||||
idx = attemptIndexFromDisk(inputFile, codec, indexFile, lock);
|
||||
|
||||
// if we managed to make an index, return
|
||||
if (idx != null) return idx;
|
||||
|
||||
// we couldn't read the file, or we fell out of the conditions above, continue on to making a new index
|
||||
return createNewIndex(inputFile, codec, onDisk, indexFile, lock);
|
||||
}
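// --- Hypothetical usage sketch, not part of this commit ---
// Asks for an index directly; the file name and codec are invented. If "example.table.idx"
// is readable, current and newer than the data file it is loaded; otherwise a new index is
// built in memory and written back only if the ".idx" file can be exclusively locked.
static void loadIndexSketch() throws IOException {
    Index index = RMDTrackBuilder.loadIndex(new File("example.table"), new TableCodec(), true);
}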
|
||||
|
||||
/**
|
||||
* attempt to read the index from disk
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec to read from
|
||||
* @param indexFile the index file itself
|
||||
* @param lock the lock file
|
||||
* @return an index, or null if we couldn't load one
|
||||
* @throws IOException if we fail for FS issues
|
||||
*/
|
||||
protected static Index attemptIndexFromDisk(File inputFile, FeatureCodec codec, File indexFile, FSLockWithShared lock) throws IOException {
|
||||
boolean locked;
|
||||
try {
|
||||
locked = lock.sharedLock();
|
||||
}
|
||||
catch(FileSystemInabilityToLockException ex) {
|
||||
throw new UserException.MissortedFile(inputFile, "Unexpected inability to lock exception", ex);
|
||||
}
|
||||
Index idx;
|
||||
try {
|
||||
if (!locked) // can't lock file
|
||||
idx = createIndexInMemory(inputFile, codec);
|
||||
else
|
||||
idx = loadFromDisk(inputFile, indexFile);
|
||||
} finally {
|
||||
if (locked) lock.unlock();
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
|
||||
/**
|
||||
* load the index from disk, checking for out of date indexes and old versions (both of which are deleted)
|
||||
* @param inputFile the input file
|
||||
* @param indexFile the input file, plus the index extension
|
||||
* @return an Index, or null if we're unable to load
|
||||
*/
|
||||
public static Index loadFromDisk(File inputFile, File indexFile) {
|
||||
logger.info("Loading Tribble index from disk for file " + inputFile);
|
||||
Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath());
|
||||
|
||||
// check if the file is up-to date (filestamp and version check)
|
||||
if (index.isCurrentVersion() && indexFile.lastModified() > inputFile.lastModified())
|
||||
return index;
|
||||
else if (indexFile.lastModified() < inputFile.lastModified())
|
||||
logger.warn("Index file " + indexFile + " is out of date (index older than input file), deleting and updating the index file");
|
||||
else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable
|
||||
logger.warn("Index file " + indexFile + " is out of date (old version), deleting and updating the index file");
|
||||
|
||||
// however we got here, remove the index and return null
|
||||
boolean deleted = indexFile.delete();
|
||||
|
||||
if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)");
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* attempt to create the index, and write it to disk
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec to use
|
||||
* @param onDisk whether they asked for disk storage or not
|
||||
* @param indexFile the index file location
|
||||
* @param lock the locking object
|
||||
* @return the index object
|
||||
* @throws IOException when unable to create the new index
|
||||
*/
|
||||
private static Index createNewIndex(File inputFile, FeatureCodec codec, boolean onDisk, File indexFile, FSLockWithShared lock) throws IOException {
|
||||
Index index = createIndexInMemory(inputFile, codec);
|
||||
|
||||
boolean locked = false; // could we exclusive lock the file?
|
||||
try {
|
||||
locked = lock.exclusiveLock();
|
||||
if (locked) {
|
||||
logger.info("Writing Tribble index to disk for file " + inputFile);
|
||||
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
|
||||
index.write(stream);
|
||||
stream.close();
|
||||
}
|
||||
else // we can't write it to disk, just store it in memory, tell them this
|
||||
if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only");
|
||||
return index;
|
||||
}
|
||||
catch(FileSystemInabilityToLockException ex) {
|
||||
throw new UserException.MissortedFile(inputFile,"Unexpected inability to lock exception", ex);
|
||||
}
|
||||
finally {
|
||||
if (locked) lock.unlock();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* create the index in memory, given the input file and feature codec
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec
|
||||
* @return a LinearIndex, given the file location
|
||||
* @throws IOException when unable to create the index in memory
|
||||
*/
|
||||
private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException {
|
||||
// this can take a while, let them know what we're doing
|
||||
logger.info("Creating Tribble index in memory for file " + inputFile);
|
||||
return IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
}
|
||||
|
||||
/**
|
||||
* convert a list of Strings into a sequence dictionary
|
||||
* @param contigList the contig list, in coordinate order; this is allowed to be null
|
||||
* @return a SAMSequenceDictionary, WITHOUT contig sizes
|
||||
*/
|
||||
private static SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet<String> contigList) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
if (contigList == null) return dict;
|
||||
|
||||
for (String name : contigList) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a collection of track names that match the record type.
|
||||
* @param trackRecordType the record type specified in the @RMD annotation
|
||||
* @return a collection of available track record type names that match the record type
|
||||
*/
|
||||
public Collection<String> getTrackRecordTypeNames(Class trackRecordType) {
|
||||
Set<String> names = new TreeSet<String>();
|
||||
if (trackRecordType == null)
|
||||
throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object");
|
||||
|
||||
for (Map.Entry<String, Class> availableTrackRecordType: getAvailableTrackNamesAndRecordTypes().entrySet()) {
|
||||
if (availableTrackRecordType.getValue() != null && trackRecordType.isAssignableFrom(availableTrackRecordType.getValue()))
|
||||
names.add(availableTrackRecordType.getKey());
|
||||
}
|
||||
return names;
|
||||
}
|
||||
|
||||
/**
|
||||
* find the associated reference meta data
|
||||
*
|
||||
* @param bindings the bindings of strings from the -B command line option
|
||||
*
|
||||
* @return a list of RMDTracks, one for each -B option
|
||||
*/
|
||||
public List<RMDTrack> getReferenceMetaDataSources(GenomeAnalysisEngine engine,List<String> bindings) {
|
||||
initializeBindings(engine,bindings);
|
||||
// try and make the tracks given their requests
|
||||
return createRequestedTrackObjects();
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize our lists of bindings
|
||||
* @param engine The engine, used to populate tags.
|
||||
* @param bindings the input to the GATK, as a list of strings passed in through the -B options
|
||||
*/
|
||||
private void initializeBindings(GenomeAnalysisEngine engine,List<String> bindings) {
|
||||
// NOTE: This method effectively acts as a one-time initializer: once the inputs have been passed in, they are locked in.
|
||||
if (inputs.size() > 0 || bindings.size() == 0)
|
||||
return;
|
||||
|
||||
for (String binding: bindings) {
|
||||
if(engine != null && engine.getTags(binding).size() == 2) {
|
||||
// Assume that if tags are present, those tags are name and type.
|
||||
// Name is always first, followed by type.
|
||||
List<String> parameters = engine.getTags(binding);
|
||||
String name = parameters.get(0);
|
||||
String type = parameters.get(1);
|
||||
inputs.add(new RMDTriplet(name,type,binding));
|
||||
}
|
||||
else {
|
||||
// Otherwise, use old-format bindings.
|
||||
String[] split = binding.split(",");
|
||||
if (split.length != 3) throw new IllegalArgumentException(binding + " is not a valid reference metadata track description");
|
||||
inputs.add(new RMDTriplet(split[0], split[1], split[2]));
|
||||
}
|
||||
}
|
||||
}
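// Hypothetical illustration, not part of this commit: what the loop above produces for
// one old-format (untagged) binding string; the name, type and path are all invented.
static void bindingSketch() {
    String binding = "dbsnp,VCF,/path/to/dbsnp.vcf";
    String[] split = binding.split(",");   // ["dbsnp", "VCF", "/path/to/dbsnp.vcf"]
    RMDTriplet triplet = new RMDTriplet(split[0], split[1], split[2]);
}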
|
||||
|
||||
/**
|
||||
* create the requested track objects
|
||||
*
|
||||
* @return a list of the tracks, one for each of the requested input tracks
|
||||
*/
|
||||
private List<RMDTrack> createRequestedTrackObjects() {
|
||||
// create live instances of the tracks
|
||||
List<RMDTrack> tracks = new ArrayList<RMDTrack>();
|
||||
|
||||
// create instances of each of the requested types
|
||||
for (RMDTriplet trip : inputs) {
|
||||
Class featureCodecClass = getAvailableTrackNamesAndTypes().get(trip.getType().toUpperCase());
|
||||
if (featureCodecClass == null)
|
||||
throw new UserException.BadArgumentValue("-B",trip.getType());
|
||||
tracks.add(createInstanceOfTrack(featureCodecClass, trip.getName(), new File(trip.getFile())));
|
||||
}
|
||||
return tracks;
|
||||
}
|
||||
}
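// --- Hypothetical usage sketch, not part of this commit ---
// The engine-level entry point that replaces the old RMDTrackManager: each -B binding
// becomes one RMDTrack. The binding string is invented and assumes the TableCodec added
// in this commit registers under the plugin name "Table"; passing a null engine forces
// the old-format "name,type,file" parsing shown above.
static void referenceMetaDataSketch() {
    RMDTrackBuilder builder = new RMDTrackBuilder();
    List<RMDTrack> tracks = builder.getReferenceMetaDataSources(
            null, Arrays.asList("mytable,Table,/path/to/example.table"));
    for (RMDTrack track : tracks)
        track.close();
}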
|
||||
|
|
|
|||
|
|
@ -1,107 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks.builders;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class RODTrackBuilder
|
||||
* <p/>
|
||||
* the builder for tracks of the current ROD system, a holdover until Tribble supports binary and multi-line formats
|
||||
*/
|
||||
public class RODTrackBuilder implements RMDTrackBuilder {
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(ReferenceOrderedData.class);
|
||||
|
||||
/**
|
||||
* the bindings from track name to the ROD class we use
|
||||
*/
|
||||
private static HashMap<String, Class<? extends ReferenceOrderedDatum>> Types = new HashMap<String, Class<? extends ReferenceOrderedDatum>>();
|
||||
|
||||
static {
|
||||
// All known ROD types
|
||||
Types.put("GELI", rodGELI.class);
|
||||
Types.put("Table", TabularROD.class);
|
||||
Types.put("Intervals", IntervalRod.class);
|
||||
Types.put("Plink", PlinkRod.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a RMDTrack of the specified type
|
||||
*
|
||||
* @param targetClass the target class of track
|
||||
* @param name what to call the track
|
||||
* @param inputFile the input file
|
||||
*
|
||||
* @return an instance of the track
|
||||
* @throws org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException
|
||||
* if we don't know of the target class or we couldn't create it
|
||||
*/
|
||||
//@Override
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
|
||||
return new RODRMDTrack(targetClass, name, inputFile, createROD(name,targetClass,inputFile));
|
||||
}
|
||||
|
||||
/** @return a map of all available track types we currently have access to create */
|
||||
@Override
|
||||
public Map<String, Class> getAvailableTrackNamesAndTypes() {
|
||||
return new HashMap<String, Class>(Types);
|
||||
}
|
||||
|
||||
/** @return a map of all available track record types we currently have access to create */
|
||||
@Override
|
||||
public Map<String, Class> getAvailableTrackNamesAndRecordTypes() {
|
||||
return new HashMap<String, Class>(Types);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helpful function that parses a single triplet of <name> <type> <file> and returns the corresponding ROD with
|
||||
* <name>, of type <type> that reads its input from <file>.
|
||||
*
|
||||
* @param trackName the name of the track to create
|
||||
* @param type the type of the track to create
|
||||
* @param fileName the filename to create the track from
|
||||
* @return a reference ordered data track
|
||||
*/
|
||||
public ReferenceOrderedData createROD(final String trackName, Class type, File fileName) {
|
||||
|
||||
// Create the ROD
|
||||
ReferenceOrderedData<?> rod = new ReferenceOrderedData<ReferenceOrderedDatum>(trackName, fileName, type );
|
||||
logger.info(String.format("Created binding from %s to %s of type %s", trackName, fileName, type));
|
||||
return rod;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,334 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks.builders;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.*;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.file.FSLockWithShared;
|
||||
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class TribbleRMDTrackBuilder
|
||||
*
|
||||
* This class keeps track of the available codecs, and knows how to put together a track
|
||||
* that gets iterators from the FeatureReader using Tribble.
|
||||
*
|
||||
*/
|
||||
public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implements RMDTrackBuilder {
|
||||
/**
|
||||
* our logger, used to capture output from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(TribbleRMDTrackBuilder.class);
|
||||
|
||||
// the linear index extension
|
||||
public static final String indexExtension = ".idx";
|
||||
|
||||
/** Create a new plugin manager. */
|
||||
public TribbleRMDTrackBuilder() {
|
||||
super(FeatureCodec.class, "Codecs", "Codec");
|
||||
}
|
||||
|
||||
/** @return a map from track name to track type for every track we currently know how to create */
|
||||
public Map<String, Class> getAvailableTrackNamesAndTypes() {
|
||||
return new HashMap<String, Class>(this.pluginsByName);
|
||||
}
|
||||
|
||||
/** @return a map from track name to record (feature) type for every track we currently know how to create */
|
||||
public Map<String, Class> getAvailableTrackNamesAndRecordTypes() {
|
||||
Map<String, Class> classes = new HashMap<String, Class>();
|
||||
for (String name: this.pluginsByName.keySet()) {
|
||||
FeatureCodec codec = this.createByName(name);
|
||||
classes.put(name, codec.getFeatureType());
|
||||
}
|
||||
return classes;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a RMDTrack of the specified type
|
||||
*
|
||||
* @param targetClass the target class of track
|
||||
* @param name what to call the track
|
||||
* @param inputFile the input file
|
||||
*
|
||||
* @return an instance of the track
|
||||
* @throws RMDTrackCreationException
|
||||
* if we don't know of the target class or we couldn't create it
|
||||
*/
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
|
||||
// return a feature reader track
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile);
|
||||
if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file");
|
||||
return new TribbleTrack(targetClass, name, inputFile, pair.first, pair.second, createCodec(targetClass, name));
|
||||
}
|
||||
|
||||
/**
|
||||
* create a tribble feature reader class, given the target class and the input file
|
||||
* @param targetClass the target class, of a Tribble Codec type
|
||||
* @param inputFile the input file, which corresponds to the feature type
|
||||
* @return a pair of <BasicFeatureSource, SAMSequenceDictionary>
|
||||
*/
|
||||
public Pair<BasicFeatureSource, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {
|
||||
return createFeatureReader(targetClass, "anonymous", inputFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a feature reader of the specified type
|
||||
* @param targetClass the target codec type
|
||||
* @param name the target name
|
||||
* @param inputFile the input file to create the track from (of the codec type)
|
||||
* @return the FeatureReader instance
|
||||
*/
|
||||
public Pair<BasicFeatureSource, SAMSequenceDictionary> createFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pair;
|
||||
if (inputFile.getAbsolutePath().endsWith(".gz"))
|
||||
pair = createBasicFeatureSourceNoAssumedIndex(targetClass, name, inputFile);
|
||||
else
|
||||
pair = getLinearFeatureReader(targetClass, name, inputFile);
|
||||
return pair;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a feature reader, without assuming there exists an index. This code assumes the feature
|
||||
* reader of the appropriate type will figure out what the right index type is, and determine if it
|
||||
* exists.
|
||||
*
|
||||
* @param targetClass the codec class type
|
||||
* @param name the name of the track
|
||||
* @param inputFile the file to load
|
||||
* @return a feature reader implementation
|
||||
*/
|
||||
private Pair<BasicFeatureSource, SAMSequenceDictionary> createBasicFeatureSourceNoAssumedIndex(Class targetClass, String name, File inputFile) {
|
||||
// we might not know the index type, try loading with the default reader constructor
|
||||
logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file");
|
||||
try {
|
||||
return new Pair<BasicFeatureSource, SAMSequenceDictionary>(BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null);
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create feature reader from file", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* add a name to the codec, if it takes one
|
||||
* @param targetClass the class to create a codec for
|
||||
* @param name the name to assign this codec
|
||||
* @return the feature codec itself
|
||||
*/
|
||||
private FeatureCodec createCodec(Class targetClass, String name) {
|
||||
FeatureCodec codec = this.createByType(targetClass);
if ( codec instanceof NameAwareCodec )
((NameAwareCodec)codec).setName(name);
return codec;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a linear feature reader, where we create the index ahead of time
|
||||
* @param targetClass the target class
|
||||
* @param name the name of the codec
|
||||
* @param inputFile the tribble file to parse
|
||||
* @return the input file as a FeatureReader
|
||||
*/
|
||||
private Pair<BasicFeatureSource, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, String name, File inputFile) {
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> reader;
|
||||
try {
|
||||
Index index = loadIndex(inputFile, createCodec(targetClass, name), true);
|
||||
reader = new Pair<BasicFeatureSource, SAMSequenceDictionary>(new BasicFeatureSource(inputFile.getAbsolutePath(),
|
||||
index,
|
||||
createCodec(targetClass, name)),
|
||||
sequenceSetToDictionary(index.getSequenceNames()));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create reader with file", e);
|
||||
} catch (IOException e) {
|
||||
throw new UserException("Unable to make the index file for " + inputFile, e);
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* create an index for the input file
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec to use
|
||||
* @param onDisk write the index to disk?
|
||||
* @return a linear index for the specified type
|
||||
* @throws IOException if we cannot write the index file
|
||||
*/
|
||||
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException {
|
||||
|
||||
// create the index file name, locking on the index file name
|
||||
File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension);
|
||||
FSLockWithShared lock = new FSLockWithShared(indexFile);
|
||||
|
||||
// acquire a lock on the file
|
||||
Index idx = null;
|
||||
if (indexFile.canRead())
|
||||
idx = attemptIndexFromDisk(inputFile, codec, indexFile, lock);
|
||||
|
||||
// if we managed to make an index, return
|
||||
if (idx != null) return idx;
|
||||
|
||||
// we couldn't read the file, or we fell out of the conditions above; continue on to making a new index
|
||||
return createNewIndex(inputFile, codec, onDisk, indexFile, lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* attempt to read the index from disk
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec to read from
|
||||
* @param indexFile the index file itself
|
||||
* @param lock the lock file
|
||||
* @return an index, or null if we couldn't load one
|
||||
* @throws IOException if we fail for FS issues
|
||||
*/
|
||||
protected static Index attemptIndexFromDisk(File inputFile, FeatureCodec codec, File indexFile, FSLockWithShared lock) throws IOException {
|
||||
boolean locked;
|
||||
try {
|
||||
locked = lock.sharedLock();
|
||||
}
|
||||
catch(FileSystemInabilityToLockException ex) {
|
||||
throw new ReviewedStingException("Unexpected inability to lock exception", ex);
|
||||
}
|
||||
Index idx;
|
||||
try {
|
||||
if (!locked) // can't lock file
|
||||
idx = createIndexInMemory(inputFile, codec);
|
||||
else
|
||||
idx = loadFromDisk(inputFile, indexFile);
|
||||
} finally {
|
||||
if (locked) lock.unlock();
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
|
||||
/**
|
||||
* load the index from disk, checking for out of date indexes and old versions (both of which are deleted)
|
||||
* @param inputFile the input file
|
||||
* @param indexFile the input file, plus the index extension
|
||||
* @return an Index, or null if we're unable to load
|
||||
*/
|
||||
public static Index loadFromDisk(File inputFile, File indexFile) {
|
||||
logger.info("Loading Tribble index from disk for file " + inputFile);
|
||||
Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath());
|
||||
|
||||
// check if the index is up to date (timestamp and version check)
|
||||
if (index.isCurrentVersion() && indexFile.lastModified() > inputFile.lastModified())
|
||||
return index;
|
||||
else if (indexFile.lastModified() < inputFile.lastModified())
|
||||
logger.warn("Index file " + indexFile + " is out of date (index older than input file), deleting and updating the index file");
|
||||
else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable
|
||||
logger.warn("Index file " + indexFile + " is out of date (old version), deleting and updating the index file");
|
||||
|
||||
// however we got here, remove the index and return null
|
||||
boolean deleted = indexFile.delete();
|
||||
|
||||
if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)");
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* attempt to create the index, and write it to disk
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec to use
|
||||
* @param onDisk if they asked for disk storage or not
|
||||
* @param indexFile the index file location
|
||||
* @param lock the locking object
|
||||
* @return the index object
|
||||
* @throws IOException when unable to create the new index
|
||||
*/
|
||||
private static Index createNewIndex(File inputFile, FeatureCodec codec, boolean onDisk, File indexFile, FSLockWithShared lock) throws IOException {
|
||||
Index index = createIndexInMemory(inputFile, codec);
|
||||
|
||||
boolean locked = false; // could we exclusive lock the file?
|
||||
try {
|
||||
locked = lock.exclusiveLock();
|
||||
if (locked) {
|
||||
logger.info("Writing Tribble index to disk for file " + inputFile);
|
||||
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
|
||||
index.write(stream);
|
||||
stream.close();
|
||||
}
|
||||
else // we can't write it to disk, just store it in memory, tell them this
|
||||
if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only");
|
||||
return index;
|
||||
}
|
||||
catch(FileSystemInabilityToLockException ex) {
|
||||
throw new ReviewedStingException("Unexpected inability to lock exception", ex);
|
||||
}
|
||||
finally {
|
||||
if (locked) lock.unlock();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* create the index in memory, given the input file and feature codec
|
||||
* @param inputFile the input file
|
||||
* @param codec the codec
|
||||
* @return a LinearIndex, given the file location
|
||||
* @throws IOException when unable to create the index in memory
|
||||
*/
|
||||
private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException {
|
||||
// this can take a while, let them know what we're doing
|
||||
logger.info("Creating Tribble index in memory for file " + inputFile);
|
||||
return IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
}
|
||||
|
||||
/**
|
||||
* convert a list of Strings into a sequence dictionary
|
||||
* @param contigList the contig list, in coordinate order; this is allowed to be null
|
||||
* @return a SAMSequenceDictionary, WITHOUT contig sizes
|
||||
*/
|
||||
private static SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet<String> contigList) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
if (contigList == null) return dict;
|
||||
|
||||
for (String name : contigList) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
}
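Since this builder is removed wholesale here, a minimal sketch of how it was typically driven, mirroring the walker call sites changed below (the dbSNP path is hypothetical):

TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
// Build a raw feature reader; a .idx index is loaded from disk or created on the fly.
File dbsnpFile = new File("dbsnp_130_hg18.rod");  // illustrative path
FeatureSource dbsnp = builder.createFeatureReader(DbSNPCodec.class, dbsnpFile).first;
// Or build a named RMDTrack and pull a GATKFeature iterator from it.
RMDTrack track = builder.createInstanceOfTrack(DbSNPCodec.class, "dbsnp", dbsnpFile);
CloseableIterator<GATKFeature> features = track.getIterator();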
@ -1,42 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
public class Alignability implements InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts,
|
||||
VariantContext vc)
|
||||
{
|
||||
TabularROD record = tracker.lookup("alignability",TabularROD.class);
|
||||
if (record == null)
|
||||
return null;
|
||||
|
||||
if (record.get("alignability") == null)
|
||||
throw new RuntimeException("ERROR: alignability column not defined in alignability input.\n");
|
||||
|
||||
int value = Integer.parseInt(record.get("alignability"));
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%d", value));
|
||||
return map;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("Alignability"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Alignability according to a mask file (3 is best, 0 is worst)")); }
|
||||
}
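For illustration, a minimal sketch (hypothetical values) of the map this annotator returns and how it surfaces downstream:

// Hypothetical result for a site whose "alignability" table row holds the value 3.
Map<String, Object> annotation = new HashMap<String, Object>();
annotation.put("Alignability", "3");
// A VCF writer consuming this map renders it in the INFO column as: Alignability=3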
@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
|
@ -399,7 +399,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
|
|||
}
|
||||
|
||||
private LocationAwareSeekableRODIterator initializeRefSeq() {
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder();
|
||||
FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class,refSeqGeneList).first;
|
||||
try {
|
||||
return new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(),"refseq"));
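The iterator returned above is consumed by seeking to each locus of interest and inspecting the overlapping RefSeq records; a hedged sketch of that pattern, mirroring the gene-name lookup further down in this change (targetLocus is an illustrative GenomeLoc):

// Hypothetical consumer of initializeRefSeq()'s iterator.
LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq();
RODRecordList annotations = refseqIterator.seekForward(targetLocus);
if (annotations != null)
    for (GATKFeature rec : annotations)
        if (((RefSeqFeature) rec.getUnderlyingObject()).overlapsExonP(targetLocus))
            logger.info("Overlapping gene: " + ((RefSeqFeature) rec.getUnderlyingObject()).getGeneName());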
@ -35,7 +35,7 @@ import org.broadinstitute.sting.gatk.filters.*;
|
|||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
|
|
@ -216,7 +216,7 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
|||
if ( RefseqFileName != null ) {
|
||||
logger.info("Using RefSeq annotations from "+RefseqFileName);
|
||||
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder();
|
||||
FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first;
|
||||
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -31,11 +31,9 @@ import org.broad.tribble.util.variantcontext.Allele;
|
|||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
|
|
@ -88,14 +86,13 @@ public class PickSequenomProbes extends RodWalker<String, String> {
|
|||
logger.info("Loading SNP mask... ");
|
||||
ReferenceOrderedData snp_mask;
|
||||
if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) {
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder();
|
||||
CloseableIterator<GATKFeature> iter = builder.createInstanceOfTrack(DbSNPCodec.class,"snp_mask",new java.io.File(SNP_MASK)).getIterator();
|
||||
snpMaskIterator = new SeekableRODIterator(iter);
|
||||
|
||||
} else {
|
||||
snp_mask = new ReferenceOrderedData<TabularROD>("snp_mask",
|
||||
new java.io.File(SNP_MASK), TabularROD.class);
|
||||
snpMaskIterator = new SeekableRODIterator(new GATKFeatureIterator(snp_mask.iterator()));
|
||||
// TODO: fix me when Plink is back
|
||||
throw new IllegalArgumentException("We currently do not support other snp_mask tracks (like Plink)");
|
||||
}
|
||||
|
||||
}
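For reference, a hedged sketch of how the snpMaskIterator built above is queried per candidate site, assuming RODRecordList's List view (currentLocus is an illustrative GenomeLoc):

// Hypothetical per-site lookup against the SNP mask iterator initialized above.
RODRecordList maskHits = snpMaskIterator.seekForward(currentLocus);
boolean masked = (maskHits != null && maskHits.size() > 0);
if (masked) logger.info("Site " + currentLocus + " overlaps the SNP mask");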
@ -32,7 +32,6 @@ import org.broad.tribble.Feature;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.PlinkRod;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
|
|
@ -208,10 +207,6 @@ public class SequenomValidationConverter extends RodWalker<Pair<VariantContext,
|
|||
infoMap.put(VCFConstants.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount));
|
||||
infoMap.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount()));
|
||||
|
||||
// set the id if it's a plink rod
|
||||
if ( rod instanceof PlinkRod )
|
||||
infoMap.put(VariantContext.ID_KEY, ((PlinkRod)rod).getVariantName());
|
||||
|
||||
vContext = VariantContextUtils.modifyAttributes(vContext, infoMap);
|
||||
|
||||
return new Pair<VariantContext, Byte>(vContext, ref.getBase());
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
|
|
@ -47,7 +47,7 @@ public class DbSNPWindowCounter extends LocusWalker<Integer, Long> {
|
|||
|
||||
|
||||
public void initialize() {
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder();
|
||||
reader = builder.createFeatureReader(DbSNPCodec.class,myDbSNPFile).first;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
|
|||
|
||||
if ( intervalsList != null && intervalsList.size() > 0 ) {
|
||||
for ( Object interval : intervalsList ) {
|
||||
GenomeLoc loc = ((IntervalRod) interval).getLocation();
|
||||
GenomeLoc loc = ((GATKFeature)interval).getLocation();
|
||||
if ( ! intervalBuffer.keySet().contains(loc) ) {
|
||||
intervalBuffer.put(loc,new IntervalInfoBuilder());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,8 +2,6 @@ package org.broadinstitute.sting.oneoffprojects.walkers;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
|
|||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
|
|
@ -36,7 +36,7 @@ public class IndelAnnotator extends RodWalker<Integer,Long> {
|
|||
|
||||
public void initialize() {
|
||||
if ( RefseqFileName != null ) {
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder();
|
||||
FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first;
|
||||
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import org.broad.tribble.util.variantcontext.VariantContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvaluator;
|
||||
import org.broadinstitute.sting.playground.utils.report.tags.DataPoint;
|
||||
import org.broadinstitute.sting.playground.utils.report.utils.TableType;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,192 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.hybridselection;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Accumulates coverage across hybrid selection bait intervals to assess the effect of bait adjacency and overlap on coverage
|
||||
*/
|
||||
|
||||
@By(DataSource.REFERENCE)
|
||||
public class CoverageAcrossBaitsWalker extends LocusWalker<Pair<Integer, Integer>, CoverageAcrossBaitsWalker.Coverage> {
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
/* Accumulates coverage across hybrid selection bait intervals to assess the effect of bait adjacency.
   Requires input bait intervals that overhang the actual bait interval so that coverage is captured at those flanking points.
   Outputs an R-parseable file that holds all of the data in lists and then does some basic plotting. */
|
||||
@Argument(fullName="include_duplicates", shortName="idup", required=false, doc="consider duplicate reads")
|
||||
public boolean INCLUDE_DUPLICATE_READS = false;
|
||||
|
||||
@Argument(fullName="min_mapq", shortName="mmq", required=false, doc="Minimum mapping quality of reads to consider")
|
||||
public Integer MIN_MAPQ = 1;
|
||||
|
||||
@Argument(fullName="min_len", shortName="min", required=false, doc="Minimum length of interval to consider")
|
||||
public Integer MIN_LEN = Integer.MIN_VALUE;
|
||||
|
||||
@Argument(fullName="max_len", shortName="max", required=false, doc="Maximum length of interval to consider")
|
||||
public Integer MAX_LEN = 5000;
|
||||
|
||||
@Argument(fullName="max_bait_count", shortName="mbc", required=false, doc="Maximum number of baits to consider")
|
||||
public Integer MAX_BAIT_COUNT = 24;
|
||||
|
||||
@Argument(fullName="free_standing_distance", shortName="fsd", required=false, doc="minimum distance to next interval to consider freestanding")
|
||||
public Integer FREE_STANDING_DISTANCE = 500;
|
||||
|
||||
public static class Coverage {
|
||||
// Container for carrying positive and negative strand coverage
|
||||
public ArrayList<Integer> pos = new ArrayList<Integer>();
|
||||
public ArrayList<Integer> neg = new ArrayList<Integer>();
|
||||
public void addCoveragePoints(Pair<Integer,Integer> pn) {
|
||||
pos.add(pn.first);
|
||||
neg.add(pn.second);
|
||||
}
|
||||
}
|
||||
|
||||
public Pair<Integer, Integer> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
IntervalRod intervalROD = tracker.lookup("interval",IntervalRod.class);
|
||||
|
||||
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||
if (interval == null) { throw new ReviewedStingException("No intervals at locus; should not happen"); }
|
||||
int offset = (int)(context.getPosition() - interval.getStart());
|
||||
|
||||
int depth[] = new int[2];
|
||||
for ( int i = 0; i < reads.size(); i++ )
|
||||
{
|
||||
SAMRecord read = reads.get(i);
|
||||
|
||||
if (read.getNotPrimaryAlignmentFlag()) { continue; }
|
||||
if (read.getReadUnmappedFlag()) { continue; }
|
||||
if (!INCLUDE_DUPLICATE_READS && read.getDuplicateReadFlag()) { continue; }
|
||||
if (read.getMappingQuality() < MIN_MAPQ) { continue; }
|
||||
|
||||
depth[read.getReadNegativeStrandFlag() ? 0 : 1]++;
|
||||
}
|
||||
|
||||
return new Pair<Integer,Integer>(depth[0],depth[1]);
|
||||
}
|
||||
|
||||
public Coverage reduceInit() { return new Coverage(); }
|
||||
public Coverage reduce(Pair<Integer, Integer> depths, Coverage cov) {
|
||||
cov.addCoveragePoints(depths);
|
||||
return cov;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if your walker wants to reduce each interval separately. Default is false.
|
||||
* If you set this flag, several things will happen.
|
||||
* The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce
|
||||
* Reduce will accumulate normally at each map unit in the interval
|
||||
* However, onTraversalDone(reduce) will be called after each interval is processed.
|
||||
* The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done,
|
||||
* which is overloaded here to call onTraversalDone(reduce) for each location
|
||||
*/
|
||||
public boolean isReduceByInterval() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void onTraversalDone(List<Pair<GenomeLoc, Coverage>> results) {
|
||||
int TOTAL_BAIT_WINDOW = MAX_BAIT_COUNT*120+FREE_STANDING_DISTANCE*2;
|
||||
int depths[][][] = new int[2][MAX_BAIT_COUNT+1][TOTAL_BAIT_WINDOW]; // Indices: pos/neg strand count, bait count, position in bait
|
||||
int bait_count[] = new int[MAX_BAIT_COUNT+1];
|
||||
|
||||
for (Pair<GenomeLoc, Coverage> pair : results) {
|
||||
GenomeLoc loc = pair.first;
|
||||
//if (loc.size() < MIN_LEN || loc.size() > MAX_LEN) { continue; }
|
||||
|
||||
long interval_width = loc.size() - FREE_STANDING_DISTANCE * 2;
|
||||
int baits = (int)(interval_width / 120);
|
||||
//out.format("#Interval_width: %d Bait_count: %d\n", interval_width, baits);
|
||||
if (interval_width % 120 != 0) { continue; }
|
||||
if (interval_width > MAX_BAIT_COUNT*120) { continue; }
|
||||
|
||||
// This target is good; count it
|
||||
bait_count[baits]++;
|
||||
Coverage pn_cov = pair.second;
|
||||
for (int pn=0; pn<2; pn++) {
|
||||
ArrayList<Integer> cov = pn == 0 ? pn_cov.neg : pn_cov.pos;
|
||||
for (int i=0; i<cov.size(); i++) {
|
||||
depths[pn][baits][i] += cov.get(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write out the R script that plots all of the coverage across intervals
|
||||
// All of the data is contained in variables named b<p|n><count>, where p/n marks the strand
// and <count> is the number of baits per target (see the format strings below)
|
||||
Boolean plot_begun = false;
|
||||
out.format("pcounts <- NULL\n");
|
||||
out.format("ncounts <- NULL\n");
|
||||
for (int baits=MAX_BAIT_COUNT; baits>0; baits--) {
|
||||
int norm_depth_width = baits*120+FREE_STANDING_DISTANCE*2;
|
||||
float norm_depths[] = new float[norm_depth_width];
|
||||
for (int pn=0; pn<2; pn++) {
|
||||
for (int i=0; i<norm_depth_width; i++) {
|
||||
norm_depths[i] = (float)depths[pn][baits][i]/bait_count[baits];
|
||||
}
|
||||
//out.format("Baits/target: %s\n", baits);
|
||||
//out.format("Depths: %s\n", Arrays.toString(norm_depths));
|
||||
//out.format("Number of baits: %d\n", bait_count[baits]);
|
||||
char pn_char = pn == 0 ? 'n' : 'p';
|
||||
String pn_color = pn == 0 ? "black" : "red";
|
||||
if (bait_count[baits] > 0) {
|
||||
String depth_str = Arrays.toString(norm_depths);
|
||||
depth_str = depth_str.substring(1,depth_str.length()-1);
|
||||
out.format("b%c%d <- c(%s)\n", pn_char, baits, depth_str);
|
||||
if (plot_begun) {
|
||||
out.format("points(b%c%d, type='l', col=\"%s\")\n", pn_char, baits, pn_color);
|
||||
}else{
|
||||
out.format("plot(b%c%d, type='l', col=\"%s\", xlab='Position from start of 1st bait', ylab='Coverage')\n", pn_char, baits, pn_color);
|
||||
plot_begun = true;
|
||||
}
|
||||
}
|
||||
out.format("%ccounts <- c(%ccounts,%d)\n", pn_char, pn_char, bait_count[baits]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -1,320 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.hybridselection;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.util.Interval;
|
||||
import net.sf.picard.util.IntervalList;
|
||||
import net.sf.picard.util.OverlapDetector;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broad.tribble.FeatureSource;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Given intervals corresponding to targets or baits in a hybrid selection experiment, this walker gives the following interval-by-interval data:
|
||||
* coverage, %GC, corresponding gene name, bait quantity, number of adjacent baits, and whether the bait is boosted.
|
||||
* It can be useful to manually disable the merging of intervals in the GATK when using this walker; this feature is not yet part of the GATK.
|
||||
*/
|
||||
@By(DataSource.REFERENCE)
|
||||
public class HybSelPerformanceWalker extends LocusWalker<Integer, HybSelPerformanceWalker.TargetInfo> implements TreeReducible<HybSelPerformanceWalker.TargetInfo> {
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Argument(fullName="min_mapq", shortName="mmq", required=false, doc="Minimum mapping quality of reads to consider")
|
||||
public Integer MIN_MAPQ = 1;
|
||||
|
||||
@Argument(fullName="min_baseq", shortName="mbq", required=false, doc="Minimum base call quality of reads to consider (default: 0)")
|
||||
public Integer MIN_BASEQ = 0;
|
||||
|
||||
@Argument(fullName="include_duplicates", shortName="idup", required=false, doc="consider duplicate reads")
|
||||
public boolean INCLUDE_DUPLICATE_READS = false;
|
||||
|
||||
@Argument(fullName="free_standing_distance", shortName="fsd", required=false, doc="minimum distance to next interval to consider freestanding")
|
||||
public Integer FREE_STANDING_DISTANCE = 500;
|
||||
|
||||
@Argument(fullName="booster", required=false, doc="interval list of booster baits")
|
||||
public File BOOSTER_FILE;
|
||||
|
||||
@Argument(fullName="booster_distance", required=false, doc="distance up to which a booster can affect a target")
|
||||
public Integer BOOSTER_DISTANCE = 100; // how far away can a booster be to "hit" its target?
|
||||
|
||||
@Argument(fullName="bait_quantity", required=false, doc="interval list of baits with quantity/concentration (obtainied via sequencing, Nanostring)")
|
||||
public File BAIT_QUANT_FILE;
|
||||
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, intervals will be specified with gene names", required=false)
|
||||
String REFSEQ_FILE = null;
|
||||
|
||||
private LocationAwareSeekableRODIterator refseqIterator=null;
|
||||
|
||||
public static class TargetInfo {
|
||||
public int counts = 0;
|
||||
|
||||
// did at least two reads hit this target
|
||||
public boolean hitTwice = false;
|
||||
|
||||
public int positionsOver2x = 0;
|
||||
public int positionsOver8x = 0;
|
||||
public int positionsOver10x = 0;
|
||||
public int positionsOver20x = 0;
|
||||
public int positionsOver30x = 0;
|
||||
}
|
||||
|
||||
// @Argument(fullName="suppressLocusPrinting",required=false,defaultValue="false")
|
||||
// public boolean suppressPrinting;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
if ( REFSEQ_FILE != null ) {
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class, new File(REFSEQ_FILE)).first;
|
||||
|
||||
try {
|
||||
refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(), "refseq"));
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(REFSEQ_FILE, e);
|
||||
}
|
||||
|
||||
logger.info("Using RefSeq annotations from "+REFSEQ_FILE);
|
||||
}
|
||||
|
||||
if ( refseqIterator == null ) logger.info("No annotations available");
|
||||
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
|
||||
int depth = 0;
|
||||
for ( int i = 0; i < reads.size(); i++ )
|
||||
{
|
||||
SAMRecord read = reads.get(i);
|
||||
|
||||
if (read.getNotPrimaryAlignmentFlag()) { continue; }
|
||||
if (read.getReadUnmappedFlag()) { continue; }
|
||||
if (!INCLUDE_DUPLICATE_READS && read.getDuplicateReadFlag()) { continue; }
|
||||
if (read.getMappingQuality() < MIN_MAPQ) { continue; }
|
||||
if (read.getBaseQualities()[offsets.get(i)] < MIN_BASEQ) { continue; }
|
||||
depth++;
|
||||
}
|
||||
|
||||
return depth;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if your walker wants to reduce each interval separately. Default is false.
|
||||
*
|
||||
* If you set this flag, several things will happen.
|
||||
*
|
||||
* The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce
|
||||
* Reduce will accumulate normally at each map unit in the interval
|
||||
* However, onTraversalDone(reduce) will be called after each interval is processed.
|
||||
* The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done,
|
||||
* which is overloaded here to call onTraversalDone(reduce) for each location
|
||||
*/
|
||||
public boolean isReduceByInterval() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public TargetInfo reduceInit() { return new TargetInfo(); }
|
||||
|
||||
public TargetInfo reduce(Integer depth, TargetInfo sum) {
|
||||
sum.counts += depth;
|
||||
if (depth >= 2) { sum.hitTwice = true; sum.positionsOver2x++;}
|
||||
if (depth >= 8) { sum.positionsOver8x++; }
|
||||
if (depth >= 10) { sum.positionsOver10x++; }
|
||||
if (depth >= 20) { sum.positionsOver20x++; }
|
||||
if (depth >= 30) { sum.positionsOver30x++; }
|
||||
return sum;
|
||||
}
|
||||
|
||||
public TargetInfo treeReduce(TargetInfo lhs, TargetInfo rhs) {
|
||||
return lhs; // dummy function to test if tree reduce by interval is working
|
||||
}
|
||||
|
||||
public void onTraversalDone(TargetInfo result) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(List<Pair<GenomeLoc, TargetInfo>> results) {
|
||||
out.println("location\tlength\tgc\tavg_coverage\tnormalized_coverage\thit_twice\tfreestanding\tboosted\tbases_over_2x\tbases_over_8x\tbases_over_10x\tbases_over_20x\tbases_over_30x\tgene_name\tbait_quantity\tadjacent_baits");
|
||||
|
||||
// first zip through and build an overlap detector of all the intervals, so later
|
||||
// we can calculate if this interval is free-standing
|
||||
OverlapDetector<Interval> od = new OverlapDetector<Interval>(-FREE_STANDING_DISTANCE,0);
|
||||
for(Pair<GenomeLoc, TargetInfo> pair : results) {
|
||||
GenomeLoc target = pair.getFirst();
|
||||
Interval interval = makeInterval(target);
|
||||
od.addLhs(interval, interval);
|
||||
}
|
||||
|
||||
OverlapDetector<Interval> booster = new OverlapDetector<Interval>(-BOOSTER_DISTANCE,0);
|
||||
if (BOOSTER_FILE != null) {
|
||||
IntervalList il = IntervalList.fromFile(BOOSTER_FILE);
|
||||
List<Interval> l = il.getUniqueIntervals();
|
||||
booster.addAll(l, l);
|
||||
}
|
||||
|
||||
// Create an interval detector that will give us the bait quantity specified in
|
||||
// an optional bait quantity file
|
||||
OverlapDetector<Interval> bait_quant = new OverlapDetector<Interval>(0,0);
|
||||
if (BAIT_QUANT_FILE != null) {
|
||||
IntervalList il = IntervalList.fromFile(BAIT_QUANT_FILE);
|
||||
List<Interval> l = il.getIntervals();
|
||||
bait_quant.addAll(l, l);
|
||||
}
|
||||
|
||||
// Create a detector of adjacent baits
|
||||
OverlapDetector<Interval> adjacent_bait_detector = new OverlapDetector<Interval>(-1,-1);
|
||||
for(Pair<GenomeLoc, TargetInfo> pair : results) {
|
||||
GenomeLoc target = pair.getFirst();
|
||||
Interval interval = makeInterval(target);
|
||||
adjacent_bait_detector.addLhs(interval, interval);
|
||||
}
|
||||
|
||||
// now zip through and calculate the total average coverage
|
||||
long totalCoverage = 0;
|
||||
long basesConsidered = 0;
|
||||
for(Pair<GenomeLoc, TargetInfo> pair : results) {
|
||||
GenomeLoc target = pair.getFirst();
|
||||
TargetInfo ti = pair.getSecond();
|
||||
|
||||
// as long as it was hit twice, count it
|
||||
if(ti.hitTwice) {
|
||||
long length = target.getStop() - target.getStart() + 1;
|
||||
totalCoverage += ti.counts;
|
||||
basesConsidered += length;
|
||||
}
|
||||
}
|
||||
double meanTargetCoverage = (1.0*totalCoverage) / basesConsidered;
|
||||
|
||||
|
||||
for(Pair<GenomeLoc, TargetInfo> pair : results) {
|
||||
GenomeLoc target = pair.getFirst();
|
||||
TargetInfo ti = pair.getSecond();
|
||||
long length = target.getStop() - target.getStart() + 1;
|
||||
|
||||
double avgCoverage = ((double)ti.counts / (double)length);
|
||||
double normCoverage = avgCoverage / meanTargetCoverage;
|
||||
|
||||
// calculate gc for the target
|
||||
double gc = calculateGC(target);
|
||||
|
||||
// if there is more than one hit on the overlap detector, it's not freestanding
|
||||
Interval targetInterval = makeInterval(target);
|
||||
|
||||
Collection<Interval> hits = od.getOverlaps(targetInterval);
|
||||
boolean freestanding = (hits.size() == 1);
|
||||
|
||||
boolean boosted = (booster.getOverlaps(targetInterval).size() > 0);
|
||||
|
||||
// look up the gene name info
|
||||
String geneName = getGeneName(target);
|
||||
|
||||
Collection<Interval> bait_quant_hits = bait_quant.getOverlaps(targetInterval);
|
||||
String bait_quant_string = (bait_quant_hits.size() == 1) ? bait_quant_hits.iterator().next().getName() : "0";
|
||||
if (bait_quant_hits.size() > 1) { out.printf("Warning: multiple bait quantity intervals detected; perhaps bait quantity interval lengths don't match primary interval list specified with -L\n"); }
|
||||
int adjacent_baits = adjacent_bait_detector.getOverlaps(targetInterval).size() - 1;
|
||||
|
||||
out.printf("%s:%d-%d\t%d\t%6.4f\t%6.4f\t%6.4f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%d\n",
|
||||
target.getContig(), target.getStart(), target.getStop(), length, gc,
|
||||
avgCoverage, normCoverage, ((ti.hitTwice)?1:0), ((freestanding)?1:0), ((boosted)?1:0),
|
||||
ti.positionsOver2x,
|
||||
ti.positionsOver8x,
|
||||
ti.positionsOver10x,
|
||||
ti.positionsOver20x,
|
||||
ti.positionsOver30x,
|
||||
geneName,
|
||||
bait_quant_string,
|
||||
adjacent_baits
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private Interval makeInterval(GenomeLoc target) {
|
||||
return new Interval(target.getContig(), (int) target.getStart(), (int) target.getStop());
|
||||
}
|
||||
|
||||
private String getGeneName(GenomeLoc target) {
|
||||
if (refseqIterator == null) { return "UNKNOWN"; }
|
||||
|
||||
RODRecordList annotationList = refseqIterator.seekForward(target);
|
||||
if (annotationList == null) { return "UNKNOWN"; }
|
||||
|
||||
for(GATKFeature rec : annotationList) {
|
||||
if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) {
|
||||
return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName();
|
||||
}
|
||||
}
|
||||
|
||||
return "UNKNOWN";
|
||||
|
||||
}
|
||||
|
||||
IndexedFastaSequenceFile seqFile = null;
|
||||
|
||||
private double calculateGC(GenomeLoc target) {
|
||||
if (seqFile == null) {
|
||||
seqFile = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
}
|
||||
ReferenceSequence refSeq = seqFile.getSubsequenceAt(target.getContig(),target.getStart(), target.getStop());
|
||||
|
||||
|
||||
int gcCount = 0;
|
||||
for(char base : StringUtil.bytesToString(refSeq.getBases()).toCharArray()) {
|
||||
if (base == 'C' || base == 'c' || base == 'G' || base == 'g') { gcCount++; }
|
||||
}
|
||||
return ( (double) gcCount ) / ((double) refSeq.getBases().length);
|
||||
|
||||
}
|
||||
}
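To make the coverage normalization in onTraversalDone() concrete, a small worked sketch with toy numbers (all values illustrative):

// One 100 bp target that accumulated 2,000 aligned bases, against a 10x mean over all targets hit at least twice.
long targetLength = 100;
long targetCounts = 2000;
double meanTargetCoverage = 10.0;                                      // assumed overall mean
double avgCoverage  = (double) targetCounts / (double) targetLength;   // 20.0x for this target
double normCoverage = avgCoverage / meanTargetCoverage;                // 2.0, i.e. twice the mean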
@ -1,7 +1,5 @@
|
|||
package org.broadinstitute.sting.playground.utils.report.utils;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,7 @@
|
|||
package org.broadinstitute.sting.playground.utils.report.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor;
|
|||
import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.SAMFileReaderArgumentTypeDescriptor;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
|
|
@ -65,7 +65,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram {
|
|||
GenomeAnalysisEngine GATKEngine = new GenomeAnalysisEngine();
|
||||
WalkerManager walkerManager = new WalkerManager();
|
||||
FilterManager filterManager = new FilterManager();
|
||||
RMDTrackManager rmdTrackManager = new RMDTrackManager();
|
||||
RMDTrackBuilder trackBuilder = new RMDTrackBuilder();
|
||||
|
||||
/**
|
||||
* Required main method implementation.
|
||||
|
|
@ -113,7 +113,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram {
|
|||
List<ArgumentField> argumentFields = new ArrayList<ArgumentField>();
|
||||
|
||||
argumentFields.addAll(ArgumentDefinitionField.getArgumentFields(walkerType));
|
||||
argumentFields.addAll(RodBindField.getRodArguments(walkerType, rmdTrackManager));
|
||||
argumentFields.addAll(RodBindField.getRodArguments(walkerType, trackBuilder));
|
||||
argumentFields.addAll(ReadFilterField.getFilterArguments(walkerType));
|
||||
|
||||
writeClass(COMMANDLINE_PACKAGE_NAME + "." + clpClassName, WALKER_PACKAGE_NAME,
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk;
|
|||
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
|
||||
|
|
@ -88,7 +88,7 @@ public class RodBindField extends ArgumentField {
|
|||
return exclusiveOf.toString();
|
||||
}
|
||||
|
||||
public static List<ArgumentField> getRodArguments(Class<? extends Walker> walkerClass, RMDTrackManager rmdTrackManager) {
|
||||
public static List<ArgumentField> getRodArguments(Class<? extends Walker> walkerClass, RMDTrackBuilder trackBuilder) {
|
||||
List<ArgumentField> argumentFields = new ArrayList<ArgumentField>();
|
||||
|
||||
List<RMD> requires = WalkerManager.getRequiredMetaData(walkerClass);
|
||||
|
|
@ -101,7 +101,7 @@ public class RodBindField extends ArgumentField {
|
|||
// TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers
|
||||
//fields.add(new RodBindArgumentField(argumentDefinition, true));
|
||||
} else {
|
||||
for (String typeName: rmdTrackManager.getTrackRecordTypeNames(required.type()))
|
||||
for (String typeName: trackBuilder.getTrackRecordTypeNames(required.type()))
|
||||
fields.add(new RodBindField(trackName, typeName, fields, true));
|
||||
}
|
||||
argumentFields.addAll(fields);
|
||||
|
|
@ -114,7 +114,7 @@ public class RodBindField extends ArgumentField {
|
|||
// TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers
|
||||
//fields.add(new RodBindArgumentField(argumentDefinition, false));
|
||||
} else {
|
||||
for (String typeName: rmdTrackManager.getTrackRecordTypeNames(allowed.type()))
|
||||
for (String typeName: trackBuilder.getTrackRecordTypeNames(allowed.type()))
|
||||
fields.add(new RodBindField(trackName, typeName, fields, true));
|
||||
}
|
||||
argumentFields.addAll(fields);
|
||||
|
|
|
|||
|
|
@ -1,14 +1,15 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.LocusShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.MockLocusShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.table.TableCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
|
|
@ -43,6 +44,11 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
*/
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
|
||||
/**
|
||||
* our track builder
|
||||
*/
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder();
|
||||
|
||||
@BeforeClass
|
||||
public static void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
|
|
@ -69,8 +75,8 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
@Test
|
||||
public void testSingleBinding() {
|
||||
File file = new File(testDir + "TabularDataTest.dat");
|
||||
ReferenceOrderedData rod = new ReferenceOrderedData("tableTest", file, TabularROD.class);
|
||||
ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(null, new RODRMDTrack(TabularROD.class,"tableTest",file,rod));
|
||||
RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest",file);
|
||||
ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(null,track);
|
||||
|
||||
Shard shard = new MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30)));
|
||||
|
||||
|
|
@ -78,7 +84,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
|
||||
|
||||
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20));
|
||||
TabularROD datum = tracker.lookup("tableTest",TabularROD.class);
|
||||
TableFeature datum = tracker.lookup("tableTest",TableFeature.class);
|
||||
|
||||
Assert.assertEquals("datum parameter for COL1 is incorrect", "C", datum.get("COL1"));
|
||||
Assert.assertEquals("datum parameter for COL2 is incorrect", "D", datum.get("COL2"));
|
||||
|
|
@ -92,10 +98,11 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
public void testMultipleBinding() {
|
||||
File file = new File(testDir + "TabularDataTest.dat");
|
||||
|
||||
ReferenceOrderedData rod1 = new ReferenceOrderedData("tableTest1", file, TabularROD.class);
|
||||
ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(null,new RODRMDTrack(TabularROD.class,"tableTest1",file,rod1));
|
||||
ReferenceOrderedData rod2 = new ReferenceOrderedData("tableTest2", file, TabularROD.class);
|
||||
ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(null,new RODRMDTrack(TabularROD.class,"tableTest2",file,rod2));;
|
||||
|
||||
RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest1",file);
|
||||
ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(null,track);
|
||||
RMDTrack track2 = builder.createInstanceOfTrack(TableCodec.class,"tableTest2",file);
|
||||
ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(null,track2);
|
||||
|
||||
|
||||
Shard shard = new MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30)));
|
||||
|
|
@ -104,13 +111,13 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
|
||||
|
||||
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20));
|
||||
TabularROD datum1 = tracker.lookup("tableTest1",TabularROD.class);
|
||||
TableFeature datum1 = tracker.lookup("tableTest1",TableFeature.class);
|
||||
|
||||
Assert.assertEquals("datum1 parameter for COL1 is incorrect", "C", datum1.get("COL1"));
|
||||
Assert.assertEquals("datum1 parameter for COL2 is incorrect", "D", datum1.get("COL2"));
|
||||
Assert.assertEquals("datum1 parameter for COL3 is incorrect", "E", datum1.get("COL3"));
|
||||
|
||||
TabularROD datum2 = tracker.lookup("tableTest2",TabularROD.class);
|
||||
TableFeature datum2 = tracker.lookup("tableTest2", TableFeature.class);
|
||||
|
||||
Assert.assertEquals("datum2 parameter for COL1 is incorrect", "C", datum2.get("COL1"));
|
||||
Assert.assertEquals("datum2 parameter for COL2 is incorrect", "D", datum2.get("COL2"));
|
||||
|
|
|
|||
|
|
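The hunks above capture the core of the migration: the test no longer builds a ReferenceOrderedData/RODRMDTrack pair by hand, it asks RMDTrackBuilder for a track backed by a Tribble codec, and lookups come back as TableFeature instead of TabularROD. A minimal sketch of the new binding pattern, assuming the signatures shown in the diff (createInstanceOfTrack, the two-argument ReferenceOrderedDataSource constructor) and that GenomeLocParser.setupRefContigOrdering has already been called as in the test's @BeforeClass; the ReferenceOrderedDataSource package and the shard/provider wiring are elided assumptions, not part of this change:

import java.io.File;

import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.features.table.TableCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;

public class TrackBindingSketch {
    // hypothetical helper, for illustration only
    static ReferenceOrderedDataSource bindTableTrack(File dataFile) {
        // one builder call replaces the old ReferenceOrderedData + RODRMDTrack construction
        RMDTrackBuilder builder = new RMDTrackBuilder();
        RMDTrack track = builder.createInstanceOfTrack(TableCodec.class, "tableTest", dataFile);

        // the track slots into the data source exactly where the old ROD wrapper used to go
        return new ReferenceOrderedDataSource(null, track);
    }

    // downstream callers now ask the tracker for Tribble features, e.g.
    // TableFeature datum = tracker.lookup("tableTest", TableFeature.class);
}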
@ -2,9 +2,10 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
|||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.table.TableCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -47,13 +48,13 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
public static void init() throws FileNotFoundException {
|
||||
File sequenceFile = new File(hg18Reference);
|
||||
GenomeLocParser.setupRefContigOrdering(new IndexedFastaSequenceFile(sequenceFile));
|
||||
TabularROD.setDelimiter(TabularROD.DEFAULT_DELIMITER, TabularROD.DEFAULT_DELIMITER_REGEX);
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
File file = new File(testDir + "TabularDataTest.dat");
|
||||
rod = new RODRMDTrack(TabularROD.class, "tableTest", file, new ReferenceOrderedData("tableTest", file, TabularROD.class));
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder();
|
||||
rod = builder.createInstanceOfTrack(TableCodec.class, "tableTest", file);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -64,7 +65,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
||||
TabularROD datum = (TabularROD)iterator.next().get(0).getUnderlyingObject();
|
||||
TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject();
|
||||
|
||||
assertTrue(datum.getLocation().equals(testSite1));
|
||||
assertTrue(datum.get("COL1").equals("A"));
|
||||
|
|
@ -90,26 +91,26 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
|
||||
// Test out-of-order access: first iterator2, then iterator1.
|
||||
// Ugh...first call to a region needs to be a seek.
|
||||
TabularROD datum = (TabularROD)iterator2.seekForward(testSite2).get(0).getUnderlyingObject();
|
||||
TableFeature datum = (TableFeature)iterator2.seekForward(testSite2).get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite2));
|
||||
assertTrue(datum.get("COL1").equals("C"));
|
||||
assertTrue(datum.get("COL2").equals("D"));
|
||||
assertTrue(datum.get("COL3").equals("E"));
|
||||
|
||||
datum = (TabularROD)iterator1.next().get(0).getUnderlyingObject();
|
||||
datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite1));
|
||||
assertTrue(datum.get("COL1").equals("A"));
|
||||
assertTrue(datum.get("COL2").equals("B"));
|
||||
assertTrue(datum.get("COL3").equals("C"));
|
||||
|
||||
// Advance iterator2, and make sure both iterator's contents are still correct.
|
||||
datum = (TabularROD)iterator2.next().get(0).getUnderlyingObject();
|
||||
datum = (TableFeature)iterator2.next().get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite3));
|
||||
assertTrue(datum.get("COL1").equals("F"));
|
||||
assertTrue(datum.get("COL2").equals("G"));
|
||||
assertTrue(datum.get("COL3").equals("H"));
|
||||
|
||||
datum = (TabularROD)iterator1.next().get(0).getUnderlyingObject();
|
||||
datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite2));
|
||||
assertTrue(datum.get("COL1").equals("C"));
|
||||
assertTrue(datum.get("COL2").equals("D"));
|
||||
|
|
@ -135,7 +136,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
||||
TabularROD datum = (TabularROD)iterator.next().get(0).getUnderlyingObject();
|
||||
TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite1));
|
||||
assertTrue(datum.get("COL1").equals("A"));
|
||||
assertTrue(datum.get("COL2").equals("B"));
|
||||
|
|
@ -150,7 +151,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
||||
datum = (TabularROD)iterator.seekForward(testSite3).get(0).getUnderlyingObject();
|
||||
datum = (TableFeature)iterator.seekForward(testSite3).get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite3));
|
||||
assertTrue(datum.get("COL1").equals("F"));
|
||||
assertTrue(datum.get("COL2").equals("G"));
|
||||
|
|
@ -170,7 +171,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators());
|
||||
|
||||
TabularROD datum = (TabularROD)iterator.seekForward(testSite3).get(0).getUnderlyingObject();
|
||||
TableFeature datum = (TableFeature)iterator.seekForward(testSite3).get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite3));
|
||||
assertTrue(datum.get("COL1").equals("F"));
|
||||
assertTrue(datum.get("COL2").equals("G"));
|
||||
|
|
@ -185,7 +186,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
|
||||
Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators());
|
||||
|
||||
datum = (TabularROD)iterator.next().get(0).getUnderlyingObject();
|
||||
datum = (TableFeature)iterator.next().get(0).getUnderlyingObject();
|
||||
assertTrue(datum.getLocation().equals(testSite1));
|
||||
assertTrue(datum.get("COL1").equals("A"));
|
||||
assertTrue(datum.get("COL2").equals("B"));
|
||||
|
|
|
|||
|
|
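After the pool migration above, everything that comes out of the iterators is a Tribble TableFeature; only the cast changes, while the RODRecordList plumbing and the column accessors stay the same. A minimal sketch of reading one record, assuming the iterator was obtained from the pool exactly as in the tests above (that setup is elided here):

import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;

public class PoolIteratorSketch {
    // 'iterator' is assumed to come from the iterator pool, as in the tests above
    static void printRecordAt(LocationAwareSeekableRODIterator iterator, GenomeLoc site) {
        // seekForward (like next) hands back a RODRecordList; the underlying object
        // is now a TableFeature rather than a TabularROD
        RODRecordList records = iterator.seekForward(site);
        TableFeature datum = (TableFeature) records.get(0).getUnderlyingObject();

        // column access is unchanged: the feature still answers get("COL1"), get("COL2"), ...
        System.out.println(datum.getLocation() + "\t" + datum.get("COL1"));
    }
}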
@ -1,247 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
/*
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Jan 22, 2010
|
||||
* Time: 11:27:33 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*
|
||||
public class PlinkRodUnitTest extends BaseTest {
|
||||
// todo :: get the isIndel() isInsertion() and isDeletion() tests working again -- this may require new
|
||||
// todo :: methods in the objects themselves
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeTests() {
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("unable to load the sequence dictionary");
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
|
||||
}
|
||||
|
||||
public BufferedReader openFile(String filename) {
|
||||
try {
|
||||
return new BufferedReader(new FileReader(filename));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Couldn't open file " + filename);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStandardPedFile() {
|
||||
PlinkRod rod = new PlinkRod("test");
|
||||
try {
|
||||
rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("test file for testStandardPedFile() does not exist",e);
|
||||
}
|
||||
|
||||
// test that the sample names are correct
|
||||
|
||||
List<String> rodNames = rod.getVariantSampleNames();
|
||||
List<String> expectedNames = Arrays.asList("NA12887","NAMELY","COWBA");
|
||||
|
||||
Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size());
|
||||
|
||||
boolean namesCorrect = true;
|
||||
for ( int i = 0; i < expectedNames.size(); i++ ) {
|
||||
namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) );
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the names are correct and in the proper order",namesCorrect);
|
||||
|
||||
// test that rod can be iterated over
|
||||
|
||||
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
lociInRod.add(rod.getLocation());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size());
|
||||
|
||||
ArrayList<Genotype> snp1 = genotypesInRod.get(0);
|
||||
ArrayList<Genotype> snp3 = genotypesInRod.get(2);
|
||||
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size());
|
||||
Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size());
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size());
|
||||
|
||||
|
||||
List<Allele> snp1_individual3_alleles = snp1.get(2).getAlleles();
|
||||
List<Allele> snp3_individual2_alleles = snp3.get(1).getAlleles();
|
||||
|
||||
String alleleStr1 = new String(snp1_individual3_alleles.get(0).getBases())+new String(snp1_individual3_alleles.get(1).getBases());
|
||||
String alleleStr2 = new String(snp3_individual2_alleles.get(0).getBases())+new String(snp3_individual2_alleles.get(1).getBases());
|
||||
|
||||
Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1);
|
||||
Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2);
|
||||
|
||||
boolean name2isSame = true;
|
||||
|
||||
for ( ArrayList<String> names : sampleNamesInRod ) {
|
||||
name2isSame = name2isSame && names.get(1).equals("NAMELY");
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame);
|
||||
|
||||
// test that the loci are correctly parsed and in order
|
||||
|
||||
List<String> expectedLoci = Arrays.asList("1:123456","2:13274","3:11111");
|
||||
boolean lociCorrect = true;
|
||||
for ( int i = 0; i < 3; i ++ ) {
|
||||
lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStandardPedFileWithIndels() {
|
||||
PlinkRod rod = new PlinkRod("test");
|
||||
try {
|
||||
rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") );
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e);
|
||||
}
|
||||
|
||||
// Iterate through the rod
|
||||
|
||||
List<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
ArrayList<Boolean> snpSites = new ArrayList<Boolean>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
lociInRod.add(rod.getLocation());
|
||||
snpSites.add(rod.variantIsSNP());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
boolean snpOrder = true;
|
||||
List<Boolean> expectedOrder = Arrays.asList(true,false,true,false);
|
||||
for ( int i = 0; i < 4; i ++ ) {
|
||||
snpOrder = snpOrder && ( expectedOrder.get(i) == snpSites.get(i) );
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the variant type order is as expected", snpOrder);
|
||||
// Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1). );
|
||||
// Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() );
|
||||
Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom());
|
||||
Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet());
|
||||
Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases());
|
||||
Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases());
|
||||
Assert.assertEquals("That the snp bases are correct","GC",new String(genotypesInRod.get(2).get(2).getAlleles().get(0).getBases())+new String(genotypesInRod.get(2).get(2).getAlleles().get(1).getBases()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBinaryPedFileNoIndels() {
|
||||
PlinkRod rod = new PlinkRod("binaryTest1");
|
||||
try {
|
||||
rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_noindel_test.bed"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e);
|
||||
}
|
||||
|
||||
// iterate through the ROD and get stuff
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> samplesInRod = new ArrayList<ArrayList<String>>();
|
||||
|
||||
do {
|
||||
lociInRod.add(rod.getLocation());
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
samplesInRod.add(rod.getVariantSampleNames());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
List<String> expecLoc = Arrays.asList("1:123456","1:14327877","2:22074511","3:134787","3:178678","4:829645","4:5234132","12:1268713");
|
||||
|
||||
for ( int i = 0; i < expecLoc.size(); i ++ ) {
|
||||
Assert.assertEquals("That locus "+(i+1)+" in the rod is correct", expecLoc.get(i), lociInRod.get(i).toString());
|
||||
}
|
||||
|
||||
List<String> expecAlleles = Arrays.asList("AA","AA","AA","GG","GG","GG","AA","TA","TT","CC","CC","GC","TC","CC","TT",
|
||||
"GG","GG","AG","TT","CC","CT","TG","GG","GG");
|
||||
List<Boolean> expecHet = Arrays.asList(false,false,false,false,false,false,false,true,false,false,false,true,true,false,
|
||||
false,false,false,true,false,false,true,true,false,false);
|
||||
List<String> expecName = Arrays.asList("NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
|
||||
"NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
|
||||
"NA12878","NA12890","NA07000");
|
||||
int snpNo = 1;
|
||||
int indiv = 1;
|
||||
int alleleOffset = 0;
|
||||
for ( ArrayList<Genotype> snp : genotypesInRod ) {
|
||||
for ( Genotype gen : snp ) {
|
||||
String alStr = new String(gen.getAlleles().get(0).getBases())+new String(gen.getAlleles().get(1).getBases());
|
||||
Assert.assertEquals("That the allele of person "+indiv+" for snp "+snpNo+" is correct "+
|
||||
"(allele offset "+alleleOffset+")", expecAlleles.get(alleleOffset),alStr);
|
||||
Assert.assertEquals("That the genotype of person "+indiv+" for snp "+snpNo+" is properly set", expecHet.get(alleleOffset),gen.isHet());
|
||||
Assert.assertEquals("That the name of person "+indiv+" for snp "+snpNo+" is correct", expecName.get(alleleOffset),samplesInRod.get(snpNo-1).get(indiv-1));
|
||||
indiv++;
|
||||
alleleOffset++;
|
||||
}
|
||||
indiv = 1;
|
||||
snpNo++;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndelBinary() {
|
||||
PlinkRod rod = new PlinkRod("binaryTest2");
|
||||
try {
|
||||
rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_indel_test.bed"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e);
|
||||
}
|
||||
|
||||
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
List<String> expecAlleles = Arrays.asList("ACCA","","ACCAACCA","GGGG","GG","","AA","TA","00","","CCTCCT","CCT",
|
||||
"TC","CC","TT","GG","GG","AG","","CTTGCTTG","CTTG","TG","GG","GG");
|
||||
List<Boolean> expecDeletion = Arrays.asList(false,false,false,false,false,false,false,false,false,true,false,true,
|
||||
false,false,false,false,false,false,true,false,true,false,false,false);
|
||||
List<Boolean> expecInsertion = Arrays.asList(true,false,true,true,true,false,false,false,false,false,false,false,
|
||||
false,false,false,false,false,false,false,false,false,false,false,false);
|
||||
|
||||
int al = 0;
|
||||
for ( ArrayList<Genotype> snp : genotypesInRod ) {
|
||||
for ( Genotype gen : snp ) {
|
||||
String alStr = new String(gen.getAlleles().get(0).getBases())+new String(gen.getAlleles().get(1).getBases());
|
||||
Allele firstAl = gen.getAlleles().get(0);
|
||||
Allele secondAl = gen.getAlleles().get(1);
|
||||
// boolean isInsertion = ( firstAl.getType() == Allele.AlleleType.INSERTION || secondAl.getType() == Allele.AlleleType.INSERTION );
|
||||
// boolean isDeletion = ( firstAl.getType() == Allele.AlleleType.DELETION || secondAl.getType() == Allele.AlleleType.DELETION );
|
||||
Assert.assertEquals("That the indel file has the correct alleles for genotype "+al,expecAlleles.get(al), alStr);
|
||||
// Assert.assertEquals("That the insertion property of genotype "+al+" is correct",expecInsertion.get(al),isInsertion);
|
||||
// Assert.assertEquals("That the deletion property of genotype "+al+" is correct", expecDeletion.get(al), isDeletion);
|
||||
al++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
|
@ -1,236 +0,0 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
/**
|
||||
* Basic unit test for TabularROD
|
||||
*
|
||||
*/
|
||||
public class TabularRODUnitTest extends BaseTest {
|
||||
private static ReferenceSequenceFile seq;
|
||||
private ReferenceOrderedData ROD;
|
||||
private LocationAwareSeekableRODIterator iter;
|
||||
|
||||
|
||||
@BeforeClass
|
||||
public static void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setupTabularROD() {
|
||||
TabularROD.setDelimiter(TabularROD.DEFAULT_DELIMITER, TabularROD.DEFAULT_DELIMITER_REGEX);
|
||||
File file = new File(testDir + "TabularDataTest.dat");
|
||||
ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class);
|
||||
iter = new SeekableRODIterator(new GATKFeatureIterator(ROD.iterator()));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1() {
|
||||
logger.warn("Executing test1");
|
||||
RODRecordList oneList = iter.next();
|
||||
TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject();
|
||||
assertTrue(one.size() == 4);
|
||||
assertTrue(one.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10)));
|
||||
assertTrue(one.get("COL1").equals("A"));
|
||||
assertTrue(one.get("COL2").equals("B"));
|
||||
assertTrue(one.get("COL3").equals("C"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2() {
|
||||
logger.warn("Executing test2");
|
||||
RODRecordList oneList = iter.next();
|
||||
RODRecordList twoList = iter.next();
|
||||
TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject();
|
||||
TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject();
|
||||
assertTrue(two.size() == 4);
|
||||
assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 20)));
|
||||
assertTrue(two.get("COL1").equals("C"));
|
||||
assertTrue(two.get("COL2").equals("D"));
|
||||
assertTrue(two.get("COL3").equals("E"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test3() {
|
||||
logger.warn("Executing test3");
|
||||
RODRecordList oneList = iter.next();
|
||||
RODRecordList twoList = iter.next();
|
||||
RODRecordList threeList = iter.next();
|
||||
TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject();
|
||||
TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject();
|
||||
TabularROD three = (TabularROD)threeList.get(0).getUnderlyingObject();
|
||||
assertTrue(three.size() == 4);
|
||||
assertTrue(three.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 30)));
|
||||
assertTrue(three.get("COL1").equals("F"));
|
||||
assertTrue(three.get("COL2").equals("G"));
|
||||
assertTrue(three.get("COL3").equals("H"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDone() {
|
||||
logger.warn("Executing testDone");
|
||||
RODRecordList oneList = iter.next();
|
||||
RODRecordList twoList = iter.next();
|
||||
RODRecordList threeList = iter.next();
|
||||
TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject();
|
||||
TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject();
|
||||
TabularROD three = (TabularROD)threeList.get(0).getUnderlyingObject();
|
||||
assertTrue(!iter.hasNext());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeek() {
|
||||
logger.warn("Executing testSeek");
|
||||
RODRecordList twoList = iter.seekForward(GenomeLocParser.createGenomeLoc("chrM", 20));
|
||||
TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject();
|
||||
assertTrue(two.size() == 4);
|
||||
assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 20)));
|
||||
assertTrue(two.get("COL1").equals("C"));
|
||||
assertTrue(two.get("COL2").equals("D"));
|
||||
assertTrue(two.get("COL3").equals("E"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testToString() {
|
||||
logger.warn("Executing testToString");
|
||||
RODRecordList oneList = iter.next();
|
||||
TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject();
|
||||
assertTrue(one.toString().equals("chrM:10\tA\tB\tC"));
|
||||
}
|
||||
|
||||
// Didn't change the delimiter
|
||||
@Test (expected = RuntimeException.class)
|
||||
public void testDelim1() {
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
LocationAwareSeekableRODIterator iter_commas = new SeekableRODIterator(new GATKFeatureIterator(ROD_commas.iterator()));
|
||||
|
||||
logger.warn("Executing testDelim1");
|
||||
RODRecordList one2List = iter_commas.next();
|
||||
TabularROD one2 = (TabularROD)one2List.get(0).getUnderlyingObject();
|
||||
assertTrue(one2.size() == 5);
|
||||
assertTrue(one2.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10)));
|
||||
assertTrue(one2.get("COL1").equals("A"));
|
||||
assertTrue(one2.get("COL2").equals("B"));
|
||||
assertTrue(one2.get("COL3").equals("C"));
|
||||
assertTrue(one2.get("COL4").equals("1"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDelim2() {
|
||||
TabularROD.setDelimiter(",",",");
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
LocationAwareSeekableRODIterator iter_commas = new SeekableRODIterator(new GATKFeatureIterator(ROD_commas.iterator()));
|
||||
|
||||
logger.warn("Executing testDelim1");
|
||||
RODRecordList one2List = iter_commas.next();
|
||||
TabularROD one2 = (TabularROD)one2List.get(0).getUnderlyingObject();
|
||||
assertTrue(one2.size() == 5);
|
||||
assertTrue(one2.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10)));
|
||||
assertTrue(one2.get("COL1").equals("A"));
|
||||
assertTrue(one2.get("COL2").equals("B"));
|
||||
assertTrue(one2.get("COL3").equals("C"));
|
||||
assertTrue(one2.get("COL4").equals("1"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreation() {
|
||||
logger.warn("Executing testCreation");
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
||||
String rowData = String.format("%d %d %d", 1, 2, 3);
|
||||
TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1), rowData.split(" "));
|
||||
System.out.println(">>>>> " + row.toString());
|
||||
assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreationAndWriting() throws FileNotFoundException {
|
||||
logger.warn("Executing testCreationAndWriting");
|
||||
|
||||
File outputFile = new File(testDir + "testTabularRodOutputTemp.dat");
|
||||
PrintStream out = new PrintStream(new FileOutputStream(outputFile));
|
||||
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
out.println(TabularROD.commentString("Hello, created from test"));
|
||||
out.println(TabularROD.commentString(""));
|
||||
out.println(TabularROD.headerString(header));
|
||||
|
||||
String rowData = String.format("%d %d %d", 1, 2, 3);
|
||||
TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1), rowData.split(" "));
|
||||
out.println(row.toString());
|
||||
|
||||
rowData = String.format("%d %d %d", 3, 4, 5);
|
||||
row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 2), rowData.split(" "));
|
||||
out.println(row.toString());
|
||||
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class);
|
||||
LocationAwareSeekableRODIterator iter_commas = new SeekableRODIterator(new GATKFeatureIterator(ROD_commas.iterator()));
|
||||
|
||||
RODRecordList oneList = iter_commas.next();
|
||||
TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject();
|
||||
assertTrue(one.size() == 4);
|
||||
assertTrue(one.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 1)));
|
||||
assertTrue(one.get("col1").equals("1"));
|
||||
assertTrue(one.get("col2").equals("2"));
|
||||
assertTrue(one.get("col3").equals("3"));
|
||||
|
||||
RODRecordList twoList = iter_commas.next();
|
||||
TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject();
|
||||
assertTrue(two.size() == 4);
|
||||
assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 2)));
|
||||
assertTrue(two.get("col1").equals("3"));
|
||||
assertTrue(two.get("col2").equals("4"));
|
||||
assertTrue(two.get("col3").equals("5"));
|
||||
}
|
||||
|
||||
/* @Test (expected=RuntimeException.class )
|
||||
public void testBadHeader1() {
|
||||
logger.warn("Executing testBadHeader1");
|
||||
ArrayList<String> header = new ArrayList<String>();
|
||||
TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1));
|
||||
}*/
|
||||
|
||||
/* @Test (expected=RuntimeException.class )
|
||||
public void testBadHeader2() {
|
||||
logger.warn("Executing testBadHeader2");
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("col1", "col2", "col3"));
|
||||
TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1));
|
||||
}*/
|
||||
|
||||
@Test (expected=RuntimeException.class )
|
||||
public void testBadData1() {
|
||||
logger.warn("Executing testBadData1");
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
||||
String rowData = String.format("%d %d %d %d", 1, 2, 3, 4);
|
||||
TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1), rowData.split(" "));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* class RMDTrackManagerUnitTest
|
||||
* tests out the ability of the RMDTrackManager to correctly create RMDtracks based on the requested types.
|
||||
*/
|
||||
public class RMDTrackManagerUnitTest extends BaseTest {
|
||||
List<String> triplets;
|
||||
List<RMDTrack> tracks;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
RMDTrackManager manager = new RMDTrackManager();
|
||||
triplets = new ArrayList<String>();
|
||||
|
||||
// add our db snp data
|
||||
triplets.add("MyDbSNP,DBSNP,testdata/small.dbsnp.rod");
|
||||
// TODO: Aaron remove following comment, reinstate line
|
||||
//tracks = manager.getReferenceMetaDataSources(triplets);
|
||||
}
|
||||
|
||||
@Test // TODO: Aaron remove me
|
||||
public void voidTest() {
|
||||
|
||||
}
|
||||
|
||||
//@Test -- TODO: Aaron fix with next round of Tribble integration
|
||||
public void testBuilderQuery() {
|
||||
for (RMDTrack t : tracks) {
|
||||
System.err.println("name = " + t.getName() + " type = " + t.getType().getSimpleName() + " file = " + t.getFile());
|
||||
int count = 0;
|
||||
Iterator<GATKFeature> fIter;
|
||||
try {
|
||||
fIter = ((TribbleTrack) t).query("1", 1, 5000);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("blah I/O exception");
|
||||
}
|
||||
while (fIter.hasNext()) {
|
||||
fIter.next();
|
||||
count++;
|
||||
}
|
||||
Assert.assertEquals(100, count);
|
||||
}
|
||||
}
|
||||
|
||||
//@Test
|
||||
public void testBuilderIterator() {
|
||||
for (RMDTrack t : tracks) {
|
||||
System.err.println("name = " + t.getName() + " type = " + t.getType().getSimpleName() + " file = " + t.getFile());
|
||||
int count = 0;
|
||||
Iterator<GATKFeature> fIter = t.getIterator();
|
||||
while (fIter.hasNext()) {
|
||||
fIter.next();
|
||||
count++;
|
||||
}
|
||||
Assert.assertEquals(100, count);
|
||||
}
|
||||
}
|
||||
|
||||
// @Test used only to determine how fast queries are, don't uncomment! (unless you know what you're doing).
|
||||
public void testSpeedOfRealQuery() {
|
||||
IndexedFastaSequenceFile file = null;
|
||||
file = new IndexedFastaSequenceFile(new File(b36KGReference));
|
||||
final int intervalSize = 10000000;
|
||||
GenomeLocParser.setupRefContigOrdering(file.getSequenceDictionary());
|
||||
RMDTrackManager manager = new RMDTrackManager();
|
||||
// add our db snp data
|
||||
triplets.clear();
|
||||
triplets.add("db");
|
||||
triplets.add("DBSNP");
|
||||
triplets.add("../../GATK_Data/dbsnp_130_b36.rod");
|
||||
Assert.assertEquals(1, manager.getReferenceMetaDataSources(null,triplets).size());
|
||||
RMDTrack t = manager.getReferenceMetaDataSources(null,triplets).get(0);
|
||||
// make sure we have a single track
|
||||
// lets test the first and 20th contigs of the human reference
|
||||
|
||||
for (int loop = 1; loop <= 22; loop++) {
|
||||
SAMSequenceRecord seqRec = GenomeLocParser.getContigInfo(String.valueOf(loop));
|
||||
String name = seqRec.getSequenceName();
|
||||
Iterator<GATKFeature> fIter;
|
||||
for (int x = 1; x < seqRec.getSequenceLength() - intervalSize; x += intervalSize) {
|
||||
long firstTime = System.currentTimeMillis();
|
||||
long count = 0;
|
||||
try {
|
||||
fIter = ((TribbleTrack) t).query("1", x, x + intervalSize);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("blah I/O exception");
|
||||
}
|
||||
while (fIter.hasNext()) {
|
||||
fIter.next();
|
||||
count++;
|
||||
}
|
||||
System.err.println(name + "," + count + "," + (System.currentTimeMillis() - firstTime));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,274 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.tracks.builders;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Level;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broad.tribble.index.linear.LinearIndex;
|
||||
import org.broad.tribble.iterators.CloseableTribbleIterator;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broad.tribble.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* performance tests for different index types
|
||||
*/
|
||||
public class IndexPerformanceTests extends BaseTest {
|
||||
// the RMD track builder
|
||||
private TribbleRMDTrackBuilder builder;
|
||||
|
||||
// set the logger level
|
||||
Logger logger = Logger.getLogger(IndexPerformanceTests.class);
|
||||
|
||||
// the input files to test
|
||||
Map<String, File> inputFiles = new LinkedHashMap<String,File>();
|
||||
|
||||
// the input types
|
||||
Map<String, Class> inputTypes = new HashMap<String,Class>();
|
||||
|
||||
// where the vcf files are located
|
||||
String fileLocation = validationDataLocation + "Index_Performance_Data/";
|
||||
|
||||
// bin sizes to try
|
||||
int[] binSizes = {10, 100, 1000, 5000, 10000, 50000};
|
||||
|
||||
PrintWriter writer;
|
||||
PrintWriter writer2;
|
||||
/** setup the files we're going to run with, including their names */
|
||||
@Before
|
||||
public void setupFilesAndIndexes() {
|
||||
logger.setLevel(Level.INFO);
|
||||
builder = new TribbleRMDTrackBuilder();
|
||||
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference));
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
|
||||
// the input files
|
||||
/*inputFiles.put("\"10\"",new File(fileLocation + "tip10.vcf"));
|
||||
inputFiles.put("\"100\"",new File(fileLocation + "tip100.vcf"));
|
||||
inputFiles.put("\"1,000\"",new File(fileLocation + "tip1000.vcf"));
|
||||
inputFiles.put("\"10,000\"",new File(fileLocation + "tip10000.vcf"));
|
||||
inputFiles.put("\"100,000\"",new File(fileLocation + "tip100000.vcf"));
|
||||
inputFiles.put("\"1,000,000\"",new File(fileLocation + "tip1000000.vcf"));*/
|
||||
|
||||
for (String name : inputFiles.keySet()) {
|
||||
inputTypes.put(name,VCFCodec.class);
|
||||
}
|
||||
inputFiles.put("Big Table",new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt"));
|
||||
inputTypes.put("Big Table", AnnotatorInputTableCodec.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emptyTest() {
|
||||
// do nothing
|
||||
}
|
||||
|
||||
//@Test
|
||||
public void performanceTest() {
|
||||
try {
|
||||
writer = new PrintWriter(new FileWriter("testOutput_linear.txt"));
|
||||
writer2 = new PrintWriter(new FileWriter("testOutput_tree.txt"));
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Unable to open file testOutput.txt");
|
||||
}
|
||||
writer.println("name,index,binSize,createTime,seekTime,thousandPerThousand,record_count,index_size");
|
||||
writer2.println("name,index,binSize,createTime,seekTime,thousandPerThousand,record_count,index_size");
|
||||
for (String name : inputFiles.keySet()) {
|
||||
for (int size : binSizes) {
|
||||
System.err.println("running " + name + " with bin size " + size);
|
||||
printTestLine(name, true, size);
|
||||
printTestLine(name, false, size);
|
||||
}
|
||||
}
|
||||
writer.close();
|
||||
writer2.close();
|
||||
}
|
||||
|
||||
private void printTestLine(String name, boolean useLinear, int size) {
|
||||
PrintWriter wr = (useLinear) ? writer : writer2;
|
||||
List<Long> values = performIndexTest(name,useLinear, size);
|
||||
wr.print(name + "," + ((useLinear) ? "linear" : "tree") + "," + size);
|
||||
for (Long l : values) {
|
||||
wr.print(",");
|
||||
wr.print(l);
|
||||
}
|
||||
wr.println();
|
||||
}
|
||||
|
||||
/**
|
||||
* time various tasks using the specified index
|
||||
* @param name the name to get
|
||||
* @return a five-piece: the time to create the index, the time to seek to chromosome 1, and the time to process reading
|
||||
* every other 1000 bases of chr1 (of the first 100M), the count of records seen in the last operation, and the index size
|
||||
*/
|
||||
public List<Long> performIndexTest(String name, boolean useLinear, int size) {
|
||||
deleteIndex(inputFiles.get(name));
|
||||
// time creating the index
|
||||
long createTime = System.currentTimeMillis();
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get(name),inputFiles.get(name));
|
||||
createTime = System.currentTimeMillis() - createTime;
|
||||
//System.err.println("index creation took " + createTime);
|
||||
|
||||
// seek to chr1
|
||||
long seekTo1 = seekToChr1(pairing);
|
||||
|
||||
// seek every 1000 bases in Chr1
|
||||
long count = 0;
|
||||
long thousandEveryThousand = System.currentTimeMillis();
|
||||
try {
|
||||
for (int x = 1; x < 1000000; x = x + 1000) {
|
||||
//CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x+(int)Math.floor(Math.random()*1000), x+1000); // query
|
||||
CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x+1000); // query
|
||||
for (Feature feat : iter) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Unable to load file for query!!");
|
||||
}
|
||||
thousandEveryThousand = System.currentTimeMillis() - thousandEveryThousand;
|
||||
//System.err.println("thousand every thousand (for first million) took " + thousandEveryThousand);
|
||||
return Arrays.asList(createTime,seekTo1,thousandEveryThousand,count,new File(inputFiles.get(name) + ".idx").length());
|
||||
}
|
||||
|
||||
private long seekToChr1(Pair<BasicFeatureSource, SAMSequenceDictionary> pairing) {
|
||||
// time seeking to the first 1M bases of Chr1
|
||||
long seekTo1 = System.currentTimeMillis();
|
||||
try {
|
||||
CloseableTribbleIterator iter = pairing.first.query("chr1",1,10000000); // query
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Unable to load file for query!!");
|
||||
}
|
||||
seekTo1 = System.currentTimeMillis() - seekTo1;
|
||||
//System.err.println("seeking to chr1 took " + seekTo1);
|
||||
return seekTo1;
|
||||
}
|
||||
|
||||
//@Test
|
||||
public void testBigTable() {
|
||||
// store the mapping of location -> variant; this will get big
|
||||
Map<GenomeLoc, Integer> features = new TreeMap<GenomeLoc,Integer>();
|
||||
|
||||
Map<Integer,Integer> bucketToCount = getMapOfFeatures(features,false);
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pairing;
|
||||
|
||||
Map<GenomeLoc, Integer> features2 = new TreeMap<GenomeLoc,Integer>();
|
||||
Map<Integer,Integer> bucketToCount2 = getMapOfFeatures(features2,true);
|
||||
|
||||
System.err.println("Summary: ");
|
||||
System.err.println("Summary: tree " + features.size());
|
||||
System.err.println("Summary: linear " + features2.size());
|
||||
|
||||
// compare the two
|
||||
for (Map.Entry<GenomeLoc,Integer> entry: features.entrySet()) {
|
||||
if (!features2.containsKey(entry.getKey())) {
|
||||
System.err.println("key " + entry + " missing from linear, count " + entry.getValue());
|
||||
|
||||
}
|
||||
else if (features2.get(entry.getKey()) != entry.getValue()) {
|
||||
System.err.println("counts are not equal at " +
|
||||
entry.getKey() +
|
||||
" features2.get(entry.getKey()) = " +
|
||||
features2.get(entry.getKey()) +
|
||||
" feature1 = " + entry.getValue());
|
||||
}
|
||||
if (features2.containsKey(entry.getKey())) features2.remove(entry.getKey());
|
||||
}
|
||||
System.err.println("Missing from the tree :");
|
||||
for (Map.Entry<GenomeLoc,Integer> entry2: features2.entrySet()) {
|
||||
System.err.println("Position " + entry2.getKey() + " count = " + entry2.getValue());
|
||||
}
|
||||
|
||||
for (Integer bucket : bucketToCount.keySet()) {
|
||||
if (!bucketToCount2.get(bucket).equals(bucketToCount.get(bucket))) {
|
||||
System.err.println("Bucket " + bucket + " tree != linear, " + bucketToCount2.get(bucket) + " " + bucketToCount.get(bucket));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<Integer,Integer> getMapOfFeatures(Map<GenomeLoc, Integer> features, boolean useLinear) {
|
||||
File bigTable = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt");
|
||||
|
||||
deleteIndex(inputFiles.get("Big Table"));
|
||||
// time creating the index
|
||||
logger.warn("creating index, linear = " + useLinear);
|
||||
|
||||
Map<Integer,Integer> bucketToCount = new TreeMap<Integer,Integer>();
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"),inputFiles.get("Big Table"));
|
||||
logger.warn("created index, traversing");
|
||||
try {
|
||||
for (Integer x = 5000; x < 6000; x = x + 1000) {
|
||||
int bucketCount = 0;
|
||||
CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x+1000); // query
|
||||
for (Feature feat : iter) {
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd());
|
||||
if (loc.getStop() < 5000 || loc.getStart() > 6000) continue;
|
||||
int count = 0;
|
||||
if (features.containsKey(loc))
|
||||
count = features.get(loc)+1;
|
||||
features.put(loc,count);
|
||||
if (bucketToCount.containsKey(x)) bucketToCount.put(x,bucketToCount.get(x)+1);
|
||||
else bucketToCount.put(x,1);
|
||||
}
|
||||
//bucketToCount.put(x,bucketCount);
|
||||
}
|
||||
logger.warn("Done, returning");
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Unable to load file for query!!");
|
||||
}
|
||||
return bucketToCount;
|
||||
}
|
||||
|
||||
//@Test
|
||||
public void testGetTreeIndexLocation() {
|
||||
File bigTable = new File("small.table.txt");
|
||||
|
||||
deleteIndex(bigTable);
|
||||
// time creating the index
|
||||
logger.warn("creating index");
|
||||
|
||||
Map<Integer,Integer> bucketToCount = new TreeMap<Integer,Integer>();
|
||||
Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"),bigTable);
|
||||
logger.warn("created index, traversing");
|
||||
try {
|
||||
int count= 0;
|
||||
CloseableTribbleIterator<Feature> iter = null;
|
||||
for (int x = 5000; x < 6000; x = x + 1000)
|
||||
iter = pairing.first.query("chr1", x, x+1000); // query
|
||||
for (Feature feat : iter) {
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd());
|
||||
if (loc.getStop() < 5000 || loc.getStart() > 6000) continue;
|
||||
count++;
|
||||
System.err.println(feat.toString());
|
||||
}
|
||||
System.err.println(count);
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Unable to load file for query!!");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void deleteIndex(File fl) {
|
||||
File indexFile = new File(fl + TribbleRMDTrackBuilder.indexExtension);
|
||||
boolean deleted = true;
|
||||
if (indexFile.exists())
|
||||
deleted = indexFile.delete();
|
||||
if (!deleted)
|
||||
Assert.fail("Unable to delete index file");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -42,16 +42,16 @@ import java.util.Map;
|
|||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class TribbleRMDTrackBuilderUnitTest
|
||||
* Class RMDTrackBuilderUnitTest
|
||||
* <p/>
|
||||
* Testing out the builder for tribble Tracks
|
||||
*/
|
||||
public class TribbleRMDTrackBuilderUnitTest extends BaseTest {
|
||||
private TribbleRMDTrackBuilder builder;
|
||||
public class RMDTrackBuilderUnitTest extends BaseTest {
|
||||
private RMDTrackBuilder builder;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
builder = new TribbleRMDTrackBuilder();
|
||||
builder = new RMDTrackBuilder();
|
||||
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(b36KGReference));
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
|
@ -73,8 +73,8 @@ public class TribbleRMDTrackBuilderUnitTest extends BaseTest {
|
|||
Assert.fail("IO exception unexpected" + e.getMessage());
|
||||
}
|
||||
// make sure we didn't write the file (check that its timestamp is within bounds)
|
||||
//System.err.println(new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified());
|
||||
Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified()) < 100);
|
||||
//System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified());
|
||||
Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()) < 100);
|
||||
|
||||
}
|
||||
|
||||
|
|
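The timestamp assertion above leans on the builder's index-naming convention: the Tribble index lives next to the data file under the name file + RMDTrackBuilder.indexExtension, and an unchanged lastModified means the existing index was reused rather than rewritten. A small, hedged illustration of that convention; the helper names are hypothetical, and only the indexExtension constant and the naming scheme are taken from the diff:

import java.io.File;

import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;

public class IndexFileSketch {
    // true when a Tribble index already sits next to the data file
    static boolean hasIndex(File trackFile) {
        return indexFor(trackFile).exists();
    }

    // an index older than its data file is stale and should be regenerated
    static boolean indexIsStale(File trackFile) {
        File index = indexFor(trackFile);
        return !index.exists() || index.lastModified() < trackFile.lastModified();
    }

    // the naming convention the tests above rely on: data file path + index extension
    private static File indexFor(File trackFile) {
        return new File(trackFile.getPath() + RMDTrackBuilder.indexExtension);
    }
}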
@ -19,7 +19,7 @@ public class PileupWalkerIntegrationTest extends WalkerTest {
|
|||
String gatk_args = "-T Pileup -I " + validationDataLocation + "FHS_Pileup_Test.bam "
|
||||
+ "-R " + hg18Reference
|
||||
+ " -L chr15:46,347,148 -o %s";
|
||||
String expected_md5 = "052187dd2bf2516a027578c8775856a8";
|
||||
String expected_md5 = "872e89df166b90e06dd2737535c5d8b3";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(expected_md5));
|
||||
executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesUseOriginalQuals() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "72b79646061d78674a3752272823d47f");
|
||||
e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "db6c0dca1ec121f8c2e6d20f4969dee5");
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -96,7 +96,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -L 1:1-1,000" +
|
||||
" -standard" +
|
||||
" -OQ" +
|
||||
" -recalFile %s",
|
||||
" -recalFile %s" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
executeTest("testCountCovariatesUseOriginalQuals", spec);
|
||||
|
|
|
|||
|
|
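With implicit tracks gone, the CountCovariates test above has to pass its dbSNP binding explicitly on the command line. A sketch of how such a WalkerTestSpec is put together, assuming the usual WalkerTest scaffolding; the walker name, reference choice, and the bam/md5 parameters are illustrative assumptions, and only the trailing arguments are taken from the hunk:

import java.util.Arrays;

public class ExplicitDbsnpSpecSketch extends WalkerTest {
    // 'bam' and 'md5' are placeholders; the --DBSNP binding is the part this change adds
    void runCountCovariates(String bam, String md5) {
        WalkerTestSpec spec = new WalkerTestSpec(
                "-R " + b36KGReference +       // reference choice is an assumption
                " -T CountCovariates" +        // walker name assumed, not shown in the hunk
                " -I " + bam +
                " -L 1:1-1,000" +
                " -standard" +
                " -OQ" +
                " -recalFile %s" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod",
                1, // just one output file
                Arrays.asList(md5));
        executeTest("countCovariatesWithExplicitDbsnp", spec);
    }
}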
@ -6,7 +6,12 @@ import org.junit.Test;
|
|||
import java.util.Arrays;
|
||||
|
||||
public class SequenomValidationConverterIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
|
||||
public void testEmpty() {
|
||||
System.err.println("Reinstate these tests when plink is back in");
|
||||
}
|
||||
|
||||
//@Test TODO: reinstate the test when the Plink rod is back
|
||||
public void testSNPs() {
|
||||
String testPedFile = validationDataLocation + "Sequenom_Test_File.txt";
|
||||
String testArgs = "-R "+b36KGReference + " -T SequenomValidationConverter -B:sequenom,Plink "+testPedFile+" -o %s";
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
package org.broadinstitute.sting.playground.utils.report.templates;
|
||||
|
||||
import org.broadinstitute.sting.playground.utils.report.templates.TextTable;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import org.broad.tribble.util.variantcontext.Genotype;
|
|||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -76,7 +76,7 @@ public class VCFWriterUnitTest extends BaseTest {
|
|||
counter++;
|
||||
}
|
||||
Assert.assertEquals(2,counter);
|
||||
new File(fakeVCFFile + TribbleRMDTrackBuilder.indexExtension).delete();
|
||||
new File(fakeVCFFile + RMDTrackBuilder.indexExtension).delete();
|
||||
fakeVCFFile.delete();
|
||||
}
|
||||
catch (IOException e ) {
|
||||
|
|
|
|||