2009-10-06 10:45:31 +08:00
|
|
|
package org.broadinstitute.sting.utils.bed;
|
|
|
|
|
|
2010-09-12 22:02:43 +08:00
|
|
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
2009-10-06 10:45:31 +08:00
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
|
|
|
|
|
|
|
|
import java.io.*;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Created by IntelliJ IDEA.
|
|
|
|
|
* User: aaron
|
|
|
|
|
* Date: Oct 5, 2009
|
|
|
|
|
* Time: 5:46:45 PM
|
|
|
|
|
*/
|
|
|
|
|
public class BedParser {
|
|
|
|
|
// the GATk operates as a one based location, bed files are 0 based
|
|
|
|
|
static final int TO_ONE_BASED_ADDITION = 1;
|
|
|
|
|
|
|
|
|
|
// the buffered reader input
|
|
|
|
|
private final BufferedReader mIn;
|
|
|
|
|
|
2010-11-11 01:59:50 +08:00
|
|
|
private GenomeLocParser genomeLocParser;
|
|
|
|
|
|
2009-10-06 10:45:31 +08:00
|
|
|
// our array of locations
|
|
|
|
|
private List<GenomeLoc> mLocations;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* parse a bed file, given it's location
|
|
|
|
|
*
|
|
|
|
|
* @param fl
|
|
|
|
|
*/
|
2010-11-11 01:59:50 +08:00
|
|
|
public BedParser(GenomeLocParser genomeLocParser,File fl) {
|
|
|
|
|
this.genomeLocParser = genomeLocParser;
|
2009-10-06 10:45:31 +08:00
|
|
|
try {
|
|
|
|
|
mIn = new BufferedReader(new FileReader(fl));
|
|
|
|
|
} catch (FileNotFoundException e) {
|
2010-09-12 22:02:43 +08:00
|
|
|
throw new UserException.CouldNotReadInputFile(fl, e);
|
2009-10-06 10:45:31 +08:00
|
|
|
}
|
|
|
|
|
mLocations = parseLocations();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* parse a bed file, given an input reader
|
|
|
|
|
*
|
|
|
|
|
* @param fl the bed file
|
|
|
|
|
*/
|
|
|
|
|
public BedParser(BufferedReader fl) {
|
|
|
|
|
mIn = fl;
|
|
|
|
|
mLocations = parseLocations();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* parse out the locations
|
|
|
|
|
*
|
|
|
|
|
* @return a list of GenomeLocs, sorted and merged
|
|
|
|
|
*/
|
|
|
|
|
private List<GenomeLoc> parseLocations() {
|
|
|
|
|
String line = null;
|
|
|
|
|
List<GenomeLoc> locArray = new ArrayList<GenomeLoc>();
|
|
|
|
|
try {
|
|
|
|
|
while ((line = mIn.readLine()) != null) {
|
2010-11-11 01:59:50 +08:00
|
|
|
locArray.add(parseLocation(genomeLocParser,line));
|
2009-10-06 10:45:31 +08:00
|
|
|
}
|
|
|
|
|
} catch (IOException e) {
|
2010-09-12 22:02:43 +08:00
|
|
|
throw new UserException.MalformedFile("Unable to parse line in BED file.");
|
2009-10-06 10:45:31 +08:00
|
|
|
}
|
|
|
|
|
return locArray;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* parse a single location
|
|
|
|
|
*
|
|
|
|
|
* @param line the line, as a string
|
|
|
|
|
* @return a parsed genome loc
|
|
|
|
|
*/
|
2010-11-11 01:59:50 +08:00
|
|
|
public static GenomeLoc parseLocation(GenomeLocParser genomeLocParser,String line) {
|
2009-10-06 10:45:31 +08:00
|
|
|
String contig;
|
|
|
|
|
int start;
|
|
|
|
|
int stop;
|
|
|
|
|
try {
|
|
|
|
|
String parts[] = line.split("\\s+");
|
|
|
|
|
contig = parts[0];
|
|
|
|
|
start = Integer.valueOf(parts[1]) + TO_ONE_BASED_ADDITION;
|
|
|
|
|
stop = Integer.valueOf(parts[2]); // the ending point is an open interval
|
|
|
|
|
} catch (Exception e) {
|
2010-09-12 22:02:43 +08:00
|
|
|
throw new UserException.MalformedFile("Unable to process bed file line = " + line, e);
|
2009-10-06 10:45:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// we currently drop the rest of the bed record, which can contain names, scores, etc
|
2011-05-21 10:01:59 +08:00
|
|
|
return genomeLocParser.createGenomeLoc(contig, start, stop, true);
|
2009-10-06 10:45:31 +08:00
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* return the sorted, and merged (for overlapping regions)
|
|
|
|
|
*
|
|
|
|
|
* @return an arraylist
|
|
|
|
|
*/
|
|
|
|
|
public List<GenomeLoc> getLocations() {
|
|
|
|
|
return mLocations;
|
|
|
|
|
}
|
|
|
|
|
}
|