adding methods to get RODs by name or type in read traversals, performance improvements to RODs for Reads in general, and some more Tribble infrastructure.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2984 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
18ba9929f9
commit
661a043cef
|
|
@ -0,0 +1,49 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class RODMetaDataContainer
|
||||
*
|
||||
* stores both the name and the class for each ROD. This class assumes that:
|
||||
*
|
||||
* -Names must be unique
|
||||
* -Classes are allowed to have dupplicates
|
||||
*
|
||||
* This class encapsulates the ref data associations, and provides lookup by name and by
|
||||
* class type.
|
||||
*
|
||||
*/
|
||||
public class RODMetaDataContainer {
|
||||
// we only allow non-dupplicate ROD names, a HashMap is fine
|
||||
private final HashMap<String, ReferenceOrderedDatum> nameMap = new HashMap<String, ReferenceOrderedDatum>();
|
||||
|
||||
// we do allow duplicate class entries, so we need to store pairs of data
|
||||
private final List<Pair<Class, ReferenceOrderedDatum>> classMap = new ArrayList<Pair<Class, ReferenceOrderedDatum>>();
|
||||
|
||||
public void addEntry(ReferenceOrderedDatum data) {
|
||||
nameMap.put(data.getName(),data);
|
||||
classMap.add(new Pair<Class, ReferenceOrderedDatum>(data.getClass(),data));
|
||||
}
|
||||
|
||||
public Collection<ReferenceOrderedDatum> getSet(String name) {
|
||||
if (name == null) return nameMap.values();
|
||||
Set<ReferenceOrderedDatum> set = new HashSet<ReferenceOrderedDatum>();
|
||||
if (nameMap.containsKey(name)) set.add(nameMap.get(name));
|
||||
return set;
|
||||
}
|
||||
// the brute force (n) search ended up being faster than sorting and binary search in all but the most extreme cases (thousands of RODs at a location).
|
||||
public Collection<ReferenceOrderedDatum> getSet(Class cls) {
|
||||
Collection<ReferenceOrderedDatum> ret = new ArrayList<ReferenceOrderedDatum>();
|
||||
for (Pair<Class, ReferenceOrderedDatum> pair: classMap)
|
||||
if (pair.first.equals(cls)) ret.add(pair.second);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
|
@ -72,7 +72,7 @@ public class ReadBasedReferenceOrderedView implements View {
|
|||
/** stores a window of data, dropping RODs if we've passed the new reads start point. */
|
||||
class WindowedData {
|
||||
// the queue of possibly in-frame RODs; RODs are dropped as soon as they are out of scope
|
||||
private final TreeMap<Long, Set<ReferenceOrderedDatum>> mapping = new TreeMap<Long, Set<ReferenceOrderedDatum>>();
|
||||
private final TreeMap<Long, RODMetaDataContainer> mapping = new TreeMap<Long, RODMetaDataContainer>();
|
||||
|
||||
// our current location from the last read we processed
|
||||
private GenomeLoc currentLoc;
|
||||
|
|
@ -149,8 +149,8 @@ class WindowedData {
|
|||
RODRecordList list = state.iterator.next();
|
||||
for (ReferenceOrderedDatum datum : list) {
|
||||
if (!mapping.containsKey(list.getLocation().getStart()))
|
||||
mapping.put(list.getLocation().getStart(), new HashSet<ReferenceOrderedDatum>());
|
||||
mapping.get(list.getLocation().getStart()).add(datum);
|
||||
mapping.put(list.getLocation().getStart(), new RODMetaDataContainer());
|
||||
mapping.get(list.getLocation().getStart()).addEntry(datum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -178,4 +178,4 @@ class RMDDataState {
|
|||
this.dataSource = dataSource;
|
||||
this.iterator = iterator;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
|
|
@ -39,7 +40,9 @@ import java.util.*;
|
|||
*/
|
||||
public class ReadMetaDataTracker {
|
||||
private final SAMRecord record;
|
||||
private final TreeMap<Long, Set<ReferenceOrderedDatum>> mapping;
|
||||
|
||||
// the buffer of positions and RODs we've stored
|
||||
private final TreeMap<Long, RODMetaDataContainer> mapping;
|
||||
|
||||
/**
|
||||
* create a read meta data tracker, given the read and a queue of RODatum positions
|
||||
|
|
@ -47,7 +50,7 @@ public class ReadMetaDataTracker {
|
|||
* @param record the read to create offset from
|
||||
* @param mapping the mapping of reference ordered datum
|
||||
*/
|
||||
public ReadMetaDataTracker(SAMRecord record, TreeMap<Long, Set<ReferenceOrderedDatum>> mapping) {
|
||||
public ReadMetaDataTracker(SAMRecord record, TreeMap<Long, RODMetaDataContainer> mapping) {
|
||||
this.record = record;
|
||||
this.mapping = mapping;
|
||||
}
|
||||
|
|
@ -62,14 +65,22 @@ public class ReadMetaDataTracker {
|
|||
*
|
||||
* @return a mapping from the position in the read to the reference ordered datum
|
||||
*/
|
||||
private Map<Integer, Set<ReferenceOrderedDatum>> createReadAlignment(SAMRecord record, TreeMap<Long, Set<ReferenceOrderedDatum>> queue, Class cl, String name) {
|
||||
Map<Integer, Set<ReferenceOrderedDatum>> ret = new LinkedHashMap<Integer, Set<ReferenceOrderedDatum>>();
|
||||
private Map<Long, Collection<ReferenceOrderedDatum>> createReadAlignment(SAMRecord record, TreeMap<Long, RODMetaDataContainer> queue, Class cl, String name) {
|
||||
if (name != null && cl != null) throw new IllegalStateException("Both a class and name cannot be specified");
|
||||
Map<Long, Collection<ReferenceOrderedDatum>> ret = new LinkedHashMap<Long, Collection<ReferenceOrderedDatum>>();
|
||||
GenomeLoc location = GenomeLocParser.createGenomeLoc(record);
|
||||
int length = record.getReadLength();
|
||||
for (Long loc : queue.keySet()) {
|
||||
//if (location.containsP(loc)) {
|
||||
long position = loc - location.getStart();
|
||||
if (position >= 0 && position < length) ret.put((int)(position),queue.get(loc));
|
||||
Long position = loc - location.getStart();
|
||||
if (position >= 0 && position < length) {
|
||||
Collection<ReferenceOrderedDatum> set;
|
||||
if (cl != null)
|
||||
set = queue.get(loc).getSet(cl);
|
||||
else
|
||||
set = queue.get(loc).getSet(name);
|
||||
if (set != null && set.size() > 0)
|
||||
ret.put(position,set);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
||||
|
|
@ -80,12 +91,16 @@ public class ReadMetaDataTracker {
|
|||
*
|
||||
* @return a mapping from the position in the read to the reference ordered datum
|
||||
*/
|
||||
private Map<Long, Set<ReferenceOrderedDatum>> createGenomeLocAlignment(SAMRecord record, TreeMap<Long, Set<ReferenceOrderedDatum>> mapping, Class cl, String name) {
|
||||
Map<Long, Set<ReferenceOrderedDatum>> ret = new LinkedHashMap<Long, Set<ReferenceOrderedDatum>>();
|
||||
private Map<Long, Collection<ReferenceOrderedDatum>> createGenomeLocAlignment(SAMRecord record, TreeMap<Long, RODMetaDataContainer> mapping, Class cl, String name) {
|
||||
Map<Long, Collection<ReferenceOrderedDatum>> ret = new LinkedHashMap<Long, Collection<ReferenceOrderedDatum>>();
|
||||
int start = record.getAlignmentStart();
|
||||
int stop = record.getAlignmentEnd();
|
||||
for (Long location : mapping.keySet()) {
|
||||
if (location >= start && location <= stop) ret.put(location,mapping.get(location));
|
||||
if (location >= start && location <= stop)
|
||||
if (cl != null)
|
||||
ret.put(location,mapping.get(location).getSet(cl));
|
||||
else
|
||||
ret.put(location,mapping.get(location).getSet(name));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -95,7 +110,7 @@ public class ReadMetaDataTracker {
|
|||
*
|
||||
* @return a mapping of read offset to ROD(s)
|
||||
*/
|
||||
public Map<Integer, Set<ReferenceOrderedDatum>> getPositionMapping() {
|
||||
public Map<Long, Collection<ReferenceOrderedDatum>> getPositionMapping() {
|
||||
return createReadAlignment(record, mapping, null, null);
|
||||
}
|
||||
|
||||
|
|
@ -104,7 +119,43 @@ public class ReadMetaDataTracker {
|
|||
*
|
||||
* @return a mapping of genome loc position to ROD(s)
|
||||
*/
|
||||
public Map<Long, Set<ReferenceOrderedDatum>> getGenomeLocMapping() {
|
||||
public Map<Long, Collection<ReferenceOrderedDatum>> getGenomeLocMapping() {
|
||||
return createGenomeLocAlignment(record, mapping, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of read offset to ROD(s)
|
||||
*/
|
||||
public Map<Long, Collection<ReferenceOrderedDatum>> getPositionMapping(String name) {
|
||||
return createReadAlignment(record, mapping, null, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of genome loc position to ROD(s)
|
||||
*/
|
||||
public Map<Long, Collection<ReferenceOrderedDatum>> getGenomeLocMapping(String name) {
|
||||
return createGenomeLocAlignment(record, mapping, null, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of read offset to ROD(s)
|
||||
*/
|
||||
public Map<Long, Collection<ReferenceOrderedDatum>> getPositionMapping(Class cl) {
|
||||
return createReadAlignment(record, mapping, cl, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of genome loc position to ROD(s)
|
||||
*/
|
||||
public Map<Long, Collection<ReferenceOrderedDatum>> getGenomeLocMapping(Class cl) {
|
||||
return createGenomeLocAlignment(record, mapping, cl, null);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import java.util.*;
|
|||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<RODRecordList> { // }, RMDTrackBuilder {
|
||||
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<RODRecordList> {
|
||||
private String name;
|
||||
private File file = null;
|
||||
// private String fieldDelimiter;
|
||||
|
|
@ -36,32 +36,6 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(ReferenceOrderedData.class);
|
||||
|
||||
/** @return a map of all available tracks we currently have access to create */
|
||||
//@Override
|
||||
public Map<String, Class> getAvailableTrackNamesAndTypes() {
|
||||
Map<String, Class> ret = new HashMap<String, Class>();
|
||||
for (RODBinding binding: Types.values())
|
||||
ret.put(binding.name, binding.type);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a RMDTrack of the specified type
|
||||
*
|
||||
* @param targetClass the target class of track
|
||||
* @param name what to call the track
|
||||
* @param inputFile the input file
|
||||
*
|
||||
* @return an instance of the track
|
||||
* @throws org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException
|
||||
* if we don't know of the target class or we couldn't create it
|
||||
*/
|
||||
//@Override
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
|
||||
return new RODRMDTrack(targetClass, name, inputFile, parse1Binding(name,targetClass.getName(),inputFile.getAbsolutePath()));
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Static ROD type management
|
||||
|
|
@ -191,7 +165,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
* @param fileName
|
||||
* @return
|
||||
*/
|
||||
private static ReferenceOrderedData<?> parse1Binding(final String trackName, final String typeName, final String fileName) {
|
||||
public static ReferenceOrderedData<?> parse1Binding(final String trackName, final String typeName, final String fileName) {
|
||||
// Gracefully fail if we don't have the type
|
||||
if (ReferenceOrderedData.Types.get(typeName.toLowerCase()) == null)
|
||||
Utils.scareUser(String.format("Unknown ROD type: %s", typeName));
|
||||
|
|
|
|||
|
|
@ -258,8 +258,8 @@ public class SeekableRODIterator implements LocationAwareSeekableRODIterator {
|
|||
"the iterator's current contig");
|
||||
if ( interval.getContigIndex() == curr_contig ) {
|
||||
if ( interval.getStart() < curr_position )
|
||||
throw new StingException("Out of order query: query position "+interval.getStart()+" is located before "+
|
||||
"the iterator's current position "+curr_position);
|
||||
throw new StingException("Out of order query: query position "+interval +" is located before "+
|
||||
"the iterator's current position "+curr_contig + ":" + curr_position);
|
||||
if ( interval.getStop() < curr_query_end )
|
||||
throw new StingException("Unsupported querying sequence: current query interval " +
|
||||
interval+" ends before the end of previous query interval ("+curr_query_end+")");
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.tracks.builders;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack;
|
||||
import org.broadinstitute.sting.oneoffprojects.refdata.HapmapVCFROD;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class RODTrackBuilder
|
||||
* <p/>
|
||||
* the builder for tracks of the current ROD system, a holdover until Tribble supports binary and multi-line formats
|
||||
*/
|
||||
public class RODTrackBuilder implements RMDTrackBuilder {
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(ReferenceOrderedData.class);
|
||||
|
||||
public static HashMap<String, ReferenceOrderedData.RODBinding> Types = new HashMap<String, ReferenceOrderedData.RODBinding>();
|
||||
|
||||
public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
|
||||
final String boundName = name.toLowerCase();
|
||||
if (Types.containsKey(boundName)) {
|
||||
throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName));
|
||||
}
|
||||
logger.info(String.format("* Adding rod class %s", name));
|
||||
Types.put(boundName, new ReferenceOrderedData.RODBinding(name, rodType));
|
||||
}
|
||||
|
||||
static {
|
||||
// All known ROD types
|
||||
addModule("GFF", RodGenotypeChipAsGFF.class);
|
||||
//addModule("dbSNP", rodDbSNP.class);
|
||||
addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
|
||||
addModule("SAMPileup", rodSAMPileup.class);
|
||||
addModule("GELI", rodGELI.class);
|
||||
addModule("RefSeq", rodRefSeq.class);
|
||||
addModule("Table", TabularROD.class);
|
||||
addModule("PooledEM", PooledEMSNPROD.class);
|
||||
addModule("CleanedOutSNP", CleanedOutSNPROD.class);
|
||||
addModule("Sequenom", SequenomROD.class);
|
||||
addModule("SangerSNP", SangerSNPROD.class);
|
||||
addModule("SimpleIndel", SimpleIndelROD.class);
|
||||
addModule("PointIndel", PointIndelROD.class);
|
||||
addModule("HapMapGenotype", HapMapGenotypeROD.class);
|
||||
addModule("Intervals", IntervalRod.class);
|
||||
addModule("Variants", RodGeliText.class);
|
||||
addModule("GLF", RodGLF.class);
|
||||
addModule("VCF", RodVCF.class);
|
||||
addModule("PicardDbSNP", rodPicardDbSNP.class);
|
||||
addModule("HapmapVCF", HapmapVCFROD.class);
|
||||
addModule("Beagle", BeagleROD.class);
|
||||
addModule("Plink", PlinkRod.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a RMDTrack of the specified type
|
||||
*
|
||||
* @param targetClass the target class of track
|
||||
* @param name what to call the track
|
||||
* @param inputFile the input file
|
||||
*
|
||||
* @return an instance of the track
|
||||
* @throws org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException
|
||||
* if we don't know of the target class or we couldn't create it
|
||||
*/
|
||||
//@Override
|
||||
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
|
||||
return new RODRMDTrack(targetClass, name, inputFile, ReferenceOrderedData.parse1Binding(name,targetClass.getName(),inputFile.getAbsolutePath()));
|
||||
}
|
||||
|
||||
/** @return a map of all available tracks we currently have access to create */
|
||||
//@Override
|
||||
public Map<String, Class> getAvailableTrackNamesAndTypes() {
|
||||
Map<String, Class> ret = new HashMap<String, Class>();
|
||||
for (ReferenceOrderedData.RODBinding binding: Types.values())
|
||||
ret.put(binding.name, binding.type);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
|
@ -77,13 +77,13 @@ public class ReadBasedReferenceOrderedViewTest extends BaseTest {
|
|||
}
|
||||
GenomeLoc start = GenomeLocParser.createGenomeLoc(0,0,0);
|
||||
List<RMDDataState> list = new ArrayList<RMDDataState>();
|
||||
list.add(new RMDDataState(null, new FakePeekingRODIterator(start)));
|
||||
list.add(new RMDDataState(null, new FakePeekingRODIterator(start,"fakeName")));
|
||||
ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(new WindowedData(list));
|
||||
|
||||
for (SAMRecord rec : records) {
|
||||
ReadMetaDataTracker tracker = view.getReferenceOrderedDataForRead(rec);
|
||||
Map<Integer, Set<ReferenceOrderedDatum>> map = tracker.getPositionMapping();
|
||||
for (Integer i : map.keySet()) {
|
||||
Map<Long, Collection<ReferenceOrderedDatum>> map = tracker.getPositionMapping();
|
||||
for (Long i : map.keySet()) {
|
||||
Assert.assertEquals(1,map.get(i).size());
|
||||
}
|
||||
Assert.assertEquals(10,map.keySet().size());
|
||||
|
|
@ -99,8 +99,9 @@ class FakePeekingRODIterator implements LocationAwareSeekableRODIterator {
|
|||
// current location
|
||||
private GenomeLoc location;
|
||||
private ReadMetaDataTrackerTest.FakeRODatum curROD;
|
||||
|
||||
public FakePeekingRODIterator(GenomeLoc startingLoc) {
|
||||
private final String name;
|
||||
public FakePeekingRODIterator(GenomeLoc startingLoc, String name) {
|
||||
this.name = name;
|
||||
this.location = GenomeLocParser.createGenomeLoc(startingLoc.getContigIndex(),startingLoc.getStart()+1,startingLoc.getStop()+1);;
|
||||
}
|
||||
|
||||
|
|
@ -130,7 +131,7 @@ class FakePeekingRODIterator implements LocationAwareSeekableRODIterator {
|
|||
@Override
|
||||
public RODRecordList next() {
|
||||
System.err.println("Next -> " + location);
|
||||
curROD = new ReadMetaDataTrackerTest.FakeRODatum(location);
|
||||
curROD = new ReadMetaDataTrackerTest.FakeRODatum(location,name);
|
||||
location = GenomeLocParser.createGenomeLoc(location.getContigIndex(),location.getStart()+1,location.getStop()+1);
|
||||
FakeRODRecordList list = new FakeRODRecordList();
|
||||
list.add(curROD);
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.refdata;
|
|||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
|
|
@ -37,10 +38,7 @@ import org.junit.Test;
|
|||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -56,6 +54,7 @@ public class ReadMetaDataTrackerTest extends BaseTest {
|
|||
private static int readCount = 100;
|
||||
private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH;
|
||||
private static SAMFileHeader header;
|
||||
private Set<String> nameSet;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() {
|
||||
|
|
@ -65,15 +64,32 @@ public class ReadMetaDataTrackerTest extends BaseTest {
|
|||
|
||||
@Before
|
||||
public void beforeEach() {
|
||||
nameSet = new TreeSet<String>();
|
||||
nameSet.add("default");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void twoRodsAtEachReadBase() {
|
||||
nameSet.add("default2");
|
||||
ReadMetaDataTracker tracker = getRMDT(1, nameSet, true);
|
||||
|
||||
// count the positions
|
||||
int count = 0;
|
||||
for (Long x : tracker.getPositionMapping().keySet()) {
|
||||
count++;
|
||||
Assert.assertEquals(2, tracker.getPositionMapping().get(x).size());
|
||||
}
|
||||
Assert.assertEquals(10, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rodAtEachReadBase() {
|
||||
ReadMetaDataTracker tracker = getRMDT(1);
|
||||
|
||||
ReadMetaDataTracker tracker = getRMDT(1, nameSet, true);
|
||||
|
||||
// count the positions
|
||||
int count = 0;
|
||||
for (int x : tracker.getPositionMapping().keySet()) {
|
||||
for (Long x : tracker.getPositionMapping().keySet()) {
|
||||
count++;
|
||||
Assert.assertEquals(1, tracker.getPositionMapping().get(x).size());
|
||||
}
|
||||
|
|
@ -81,12 +97,78 @@ public class ReadMetaDataTrackerTest extends BaseTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void sparceRODsForRead() {
|
||||
ReadMetaDataTracker tracker = getRMDT(7);
|
||||
public void filterByName() {
|
||||
nameSet.add("default2");
|
||||
ReadMetaDataTracker tracker = getRMDT(1, nameSet, true);
|
||||
|
||||
// count the positions
|
||||
int count = 0;
|
||||
for (int x : tracker.getPositionMapping().keySet()) {
|
||||
Map<Long, Collection<ReferenceOrderedDatum>> map = tracker.getPositionMapping("default");
|
||||
for (Long x : map.keySet()) {
|
||||
count++;
|
||||
Assert.assertEquals(1, map.get(x).size());
|
||||
}
|
||||
Assert.assertEquals(10, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void filterByDupType() {
|
||||
nameSet.add("default2");
|
||||
ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same type
|
||||
// count the positions
|
||||
int count = 0;
|
||||
Map<Long, Collection<ReferenceOrderedDatum>> map = tracker.getPositionMapping(FakeRODatum.class);
|
||||
for (Long x : map.keySet()) {
|
||||
count++;
|
||||
Assert.assertEquals(2, map.get(x).size());
|
||||
}
|
||||
Assert.assertEquals(10, count);
|
||||
}
|
||||
|
||||
// @Test this test can be uncommented to determine the speed impacts of any changes to the RODs for reads system
|
||||
public void filterByMassiveDupType() {
|
||||
|
||||
for (int y = 0; y < 20; y++) {
|
||||
nameSet.add("default" + String.valueOf(y));
|
||||
long firstTime = System.currentTimeMillis();
|
||||
for (int lp = 0; lp < 1000; lp++) {
|
||||
ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same type
|
||||
// count the positions
|
||||
int count = 0;
|
||||
Map<Long, Collection<ReferenceOrderedDatum>> map = tracker.getPositionMapping(FakeRODatum.class);
|
||||
for (Long x : map.keySet()) {
|
||||
count++;
|
||||
Assert.assertEquals(y + 2, map.get(x).size());
|
||||
}
|
||||
Assert.assertEquals(10, count);
|
||||
}
|
||||
System.err.println(y + " = " + (System.currentTimeMillis() - firstTime));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void filterByType() {
|
||||
nameSet.add("default2");
|
||||
ReadMetaDataTracker tracker = getRMDT(1, nameSet, true);
|
||||
|
||||
// count the positions
|
||||
int count = 0;
|
||||
Map<Long, Collection<ReferenceOrderedDatum>> map = tracker.getPositionMapping(Fake2RODatum.class);
|
||||
for (long x : map.keySet()) {
|
||||
count++;
|
||||
Assert.assertEquals(1, map.get(x).size());
|
||||
}
|
||||
Assert.assertEquals(10, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void sparceRODsForRead() {
|
||||
ReadMetaDataTracker tracker = getRMDT(7, nameSet, true);
|
||||
|
||||
// count the positions
|
||||
int count = 0;
|
||||
for (Long x : tracker.getPositionMapping().keySet()) {
|
||||
count++;
|
||||
Assert.assertEquals(1, tracker.getPositionMapping().get(x).size());
|
||||
}
|
||||
|
|
@ -95,7 +177,7 @@ public class ReadMetaDataTrackerTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void rodByGenomeLoc() {
|
||||
ReadMetaDataTracker tracker = getRMDT(1);
|
||||
ReadMetaDataTracker tracker = getRMDT(1, nameSet, true);
|
||||
|
||||
// count the positions
|
||||
int count = 0;
|
||||
|
|
@ -106,32 +188,62 @@ public class ReadMetaDataTrackerTest extends BaseTest {
|
|||
Assert.assertEquals(10, count);
|
||||
}
|
||||
|
||||
private ReadMetaDataTracker getRMDT(int incr) {
|
||||
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "name", 0, 1, 10);
|
||||
TreeMap<Long, Set<ReferenceOrderedDatum>> data = new TreeMap<Long, Set<ReferenceOrderedDatum>>();
|
||||
for (int x = 0; x < record.getAlignmentEnd(); x+=incr) {
|
||||
|
||||
/**
|
||||
* create a ReadMetaDataTracker given:
|
||||
*
|
||||
* @param incr the spacing between site locations
|
||||
* @param names the names of the reference ordered data to create: one will be created at every location for each name
|
||||
*
|
||||
* @return a ReadMetaDataTracker
|
||||
*/
|
||||
private ReadMetaDataTracker getRMDT(int incr, Set<String> names, boolean alternateTypes) {
|
||||
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "name", 0, 1, 10);
|
||||
TreeMap<Long, RODMetaDataContainer> data = new TreeMap<Long, RODMetaDataContainer>();
|
||||
for (int x = 0; x < record.getAlignmentEnd(); x += incr) {
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(record.getReferenceIndex(), record.getAlignmentStart() + x, record.getAlignmentStart() + x);
|
||||
Set<ReferenceOrderedDatum> set = new HashSet<ReferenceOrderedDatum>();
|
||||
set.add(new FakeRODatum(loc));
|
||||
data.put((long)record.getAlignmentStart() + x,set);
|
||||
RODMetaDataContainer set = new RODMetaDataContainer();
|
||||
|
||||
int cnt = 0;
|
||||
for (String name : names) {
|
||||
if (alternateTypes)
|
||||
set.addEntry((cnt % 2 == 0) ? new FakeRODatum(loc, name) : new Fake2RODatum(loc, name));
|
||||
else
|
||||
set.addEntry(new FakeRODatum(loc, name));
|
||||
cnt++;
|
||||
}
|
||||
data.put((long) record.getAlignmentStart() + x, set);
|
||||
}
|
||||
ReadMetaDataTracker tracker = new ReadMetaDataTracker(record, data);
|
||||
return tracker;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* for testing, we want a fake rod with a different classname, for the get-by-class-name functions
|
||||
*/
|
||||
static public class Fake2RODatum extends FakeRODatum {
|
||||
|
||||
public Fake2RODatum(GenomeLoc location, String name) {
|
||||
super(location, name);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** for testing only */
|
||||
static public class FakeRODatum implements ReferenceOrderedDatum {
|
||||
|
||||
final GenomeLoc location;
|
||||
final String name;
|
||||
|
||||
public FakeRODatum(GenomeLoc location) {
|
||||
public FakeRODatum(GenomeLoc location, String name) {
|
||||
this.location = location;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "false";
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -23,13 +23,18 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
|
@ -57,10 +62,7 @@ public class RMDTrackManagerTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testBuilderQuery() {
|
||||
|
||||
|
||||
for (RMDTrack t : tracks) {
|
||||
|
||||
System.err.println("name = " + t.getName() + " type = " + t.getType().getSimpleName() + " file = " + t.getFile());
|
||||
int count = 0;
|
||||
Iterator<GATKFeature> fIter;
|
||||
|
|
@ -75,24 +77,62 @@ public class RMDTrackManagerTest extends BaseTest {
|
|||
}
|
||||
Assert.assertEquals(100, count);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBuilderIterator() {
|
||||
for (RMDTrack t : tracks) {
|
||||
|
||||
System.err.println("name = " + t.getName() + " type = " + t.getType().getSimpleName() + " file = " + t.getFile());
|
||||
int count = 0;
|
||||
Iterator<GATKFeature> fIter = null;
|
||||
fIter = t.getIterator();
|
||||
Iterator<GATKFeature> fIter = t.getIterator();
|
||||
while (fIter.hasNext()) {
|
||||
fIter.next();
|
||||
count++;
|
||||
}
|
||||
Assert.assertEquals(100, count);
|
||||
}
|
||||
}
|
||||
|
||||
// @Test used only to determine how fast queries are, don't uncomment! (unless you know what you're doing).
|
||||
public void testSpeedOfRealQuery() {
|
||||
IndexedFastaSequenceFile file = null;
|
||||
try {
|
||||
file = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
|
||||
} catch (FileNotFoundException e) {
|
||||
Assert.assertTrue(false);
|
||||
}
|
||||
final int intervalSize = 10000000;
|
||||
GenomeLocParser.setupRefContigOrdering(file.getSequenceDictionary());
|
||||
RMDTrackManager manager = new RMDTrackManager();
|
||||
// add our db snp data
|
||||
triplets.clear();
|
||||
triplets.add("db");
|
||||
triplets.add("DBSNP");
|
||||
triplets.add("../../GATK_Data/dbsnp_130_b36.rod");
|
||||
Assert.assertEquals(1, manager.getReferenceMetaDataSources(triplets).size());
|
||||
RMDTrack t = manager.getReferenceMetaDataSources(triplets).get(0);
|
||||
// make sure we have a single track
|
||||
// lets test the first and 20th contigs of the human reference
|
||||
|
||||
for (int loop = 1; loop <= 22; loop++) {
|
||||
SAMSequenceRecord seqRec = GenomeLocParser.getContigInfo(String.valueOf(loop));
|
||||
String name = seqRec.getSequenceName();
|
||||
Iterator<GATKFeature> fIter;
|
||||
for (int x = 1; x < seqRec.getSequenceLength() - intervalSize; x += intervalSize) {
|
||||
long firstTime = System.currentTimeMillis();
|
||||
long count = 0;
|
||||
try {
|
||||
fIter = ((FeatureReaderTrack) t).query("1", x, x + intervalSize);
|
||||
} catch (IOException e) {
|
||||
throw new StingException("blah I/O exception");
|
||||
}
|
||||
while (fIter.hasNext()) {
|
||||
fIter.next();
|
||||
count++;
|
||||
}
|
||||
System.err.println(name + "," + count + "," + (System.currentTimeMillis() - firstTime));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue