The building blocks for segmenting covariate counting data by read group.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
13be846c2a
commit
40bc4ae39a
|
|
@ -56,11 +56,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
long B;
|
||||
int pos;
|
||||
int qual;
|
||||
String readGroup;
|
||||
String dinuc;
|
||||
|
||||
public RecalData(int pos, int qual, String dinuc ) {
|
||||
public RecalData(int pos, int qual, String readGroup, String dinuc ) {
|
||||
this.pos = pos;
|
||||
this.qual = qual;
|
||||
this.readGroup = readGroup;
|
||||
this.dinuc = dinuc;
|
||||
}
|
||||
|
||||
|
|
@ -95,7 +97,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
for ( int j = 0; j < MAX_QUAL_SCORE+1; j++) {
|
||||
for ( int k = 0; k < NDINUCS; k++) {
|
||||
String dinuc = dinucIndex2bases(k);
|
||||
RecalData datum = new RecalData(i, j, dinuc);
|
||||
RecalData datum = new RecalData(i, j, readGroup.getReadGroupId(), dinuc);
|
||||
data.get(readGroup.getReadGroupId())[i][j][k] = datum;
|
||||
flattenData.add(datum);
|
||||
}
|
||||
|
|
@ -115,8 +117,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
if ( "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString()) &&
|
||||
!read.getReadNegativeStrandFlag() &&
|
||||
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
|
||||
(DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
|
||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
||||
//(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION)
|
||||
int offset = offsets.get(i);
|
||||
int numBases = read.getReadLength();
|
||||
|
|
@ -178,7 +179,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
|
||||
for ( RecalData datum: flattenData ) {
|
||||
if (string2dinucIndex(datum.dinuc) == dinuc_index) {
|
||||
if (datum.readGroup.equals(readGroup.getReadGroupId()) && string2dinucIndex(datum.dinuc) == dinuc_index) {
|
||||
if ((datum.N - datum.B) > 0)
|
||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||
if (datum.B > 0)
|
||||
|
|
@ -229,10 +230,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
RecalData All = new RecalData(0,0,"");
|
||||
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
||||
ByCycle.add(new RecalData(c, -1, "-"));
|
||||
ByCycle.add(new RecalData(c, -1,readGroup.getReadGroupId(),"-"));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
|
|
@ -265,10 +266,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
RecalData All = new RecalData(0,0,"");
|
||||
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int c=0; c < NDINUCS; c++) {
|
||||
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
|
||||
ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),dinucIndex2bases(c)));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
|
|
@ -302,10 +303,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
|
||||
RecalData All = new RecalData(0,0,"");
|
||||
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int q=0; q<MAX_QUAL_SCORE+1; q++) {
|
||||
ByQ.add(new RecalData(-1,q,"-"));
|
||||
ByQ.add(new RecalData(-1,q,readGroup.getReadGroupId(),"-"));
|
||||
ByQReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
@Argument(shortName="outputBAM", doc="output BAM file", required=false)
|
||||
public String outputBamFile = null;
|
||||
|
||||
HashMap<String, LogisticRegressor> regressors = new HashMap<String, LogisticRegressor>();
|
||||
Map<Pair<String,String>, LogisticRegressor> regressors = new HashMap<Pair<String,String>, LogisticRegressor>();
|
||||
private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class);
|
||||
|
||||
public void initialize() {
|
||||
|
|
@ -29,26 +29,28 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
for ( String line : lines.subList(1,lines.size()) ) {
|
||||
// dinuc coeff1 coeff2 ... coeff25
|
||||
String[] vals = line.split("\\s+");
|
||||
String dinuc = vals[0];
|
||||
String readGroup = vals[0];
|
||||
String dinuc = vals[1];
|
||||
LogisticRegressor regressor = new LogisticRegressor(2, 4);
|
||||
regressors.put(dinuc, regressor);
|
||||
regressors.put(new Pair<String,String>(readGroup,dinuc), regressor);
|
||||
System.out.printf("Vals = %s%n", Utils.join(",", vals));
|
||||
for ( int i = 1; i < vals.length; i++ ) {
|
||||
for ( int i = 2; i <= (vals.length-2); i++ ) {
|
||||
Pair<Integer, Integer> ij = mapping.get(i-1);
|
||||
try {
|
||||
double c = Double.parseDouble(vals[i]);
|
||||
regressor.setCoefficient(ij.first, ij.second, c);
|
||||
System.out.printf("Setting coefficient %s => %s = %f%n", dinuc, ij, c);
|
||||
System.out.printf("Setting coefficient %s,%s => %s = %f%n", readGroup, dinuc, ij, c);
|
||||
} catch ( NumberFormatException e ) {
|
||||
Utils.scareUser("Badly formed logistic regression header at " + vals[i] + " line: " + line );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, LogisticRegressor> e : regressors.entrySet() ) {
|
||||
String dinuc = e.getKey();
|
||||
for ( Map.Entry<Pair<String,String>, LogisticRegressor> e : regressors.entrySet() ) {
|
||||
String readGroup = e.getKey().first;
|
||||
String dinuc = e.getKey().second;
|
||||
LogisticRegressor regressor = e.getValue();
|
||||
logger.debug(String.format("Regressor: %s => %s", dinuc, regressor));
|
||||
logger.debug(String.format("Regressor: %s,%s => %s", readGroup, dinuc, regressor));
|
||||
}
|
||||
|
||||
//System.exit(1);
|
||||
|
|
@ -62,10 +64,12 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
ArrayList<Pair<Integer, Integer>> mapping = new ArrayList<Pair<Integer, Integer>>();
|
||||
|
||||
String[] elts = headerLine.split("\\s+");
|
||||
if ( ! elts[0].toLowerCase().startsWith("dinuc") ) // checking only start of first field because dinuc will be followed by a version number to be checekde later
|
||||
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine);
|
||||
if ( ! "rg".equalsIgnoreCase(elts[0]) )
|
||||
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be rg: " + elts[0] + " line: " + headerLine);
|
||||
if ( ! elts[1].toLowerCase().startsWith("dinuc") ) // checking only start of first field because dinuc will be followed by a version number to be checekde later
|
||||
Utils.scareUser("Badly formatted Logistic regression header, second left keyword must be dinuc: " + elts[1] + " line: " + headerLine);
|
||||
|
||||
for ( int k = 1; k < elts.length; k++ ) {
|
||||
for ( int k = 2; k < elts.length; k++ ) {
|
||||
String paramStr = elts[k];
|
||||
String[] ij = paramStr.split(",");
|
||||
if ( ij.length != 2 ) {
|
||||
|
|
@ -87,6 +91,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
|
||||
|
||||
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||
System.out.println(String.format("Reading: %s (%s:%d-%d)",read.getReadName(),read.getReferenceName(),read.getAlignmentStart(),read.getAlignmentEnd()));
|
||||
SAMRecord recalRead = read;
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = read.getBaseQualities();
|
||||
|
|
@ -98,6 +103,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
quals = BaseUtils.reverse(quals);
|
||||
}
|
||||
|
||||
String readGroup = read.getAttribute("RG").toString();
|
||||
int numBases = read.getReadLength();
|
||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||
//recalQuals[numBases-1] = quals[numBases-1]; // can't change last -- no dinuc
|
||||
|
|
@ -106,7 +112,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
|||
//int cycle = i; //read.getReadNegativeStrandFlag() ? numBases - i - 1 : i;
|
||||
String dinuc = String.format("%c%c", bases[cycle - 1], bases[cycle]);
|
||||
byte qual = quals[cycle];
|
||||
LogisticRegressor regressor = regressors.get(dinuc);
|
||||
LogisticRegressor regressor = regressors.get(new Pair<String,String>(readGroup,dinuc));
|
||||
byte newQual;
|
||||
|
||||
if ( regressor != null ) { // no N or some other unexpected bp in the stream
|
||||
|
|
|
|||
|
|
@ -20,6 +20,48 @@ public class Pair<X,Y> {
|
|||
*/
|
||||
public Y getSecond() { return second; }
|
||||
|
||||
/**
|
||||
* Calculate whether this pair object is equal to another object.
|
||||
* @param o The other object (hopefully a pair).
|
||||
* @return True if the two are equal; false otherwise.
|
||||
*/
|
||||
@Override
|
||||
public boolean equals( Object o ) {
|
||||
if( o == null )
|
||||
return false;
|
||||
if( !(o instanceof Pair) )
|
||||
return false;
|
||||
|
||||
Pair other = (Pair)o;
|
||||
|
||||
// Check to see whether one is null but not the other.
|
||||
if( this.first == null && other.first != null ) return false;
|
||||
if( this.second == null && other.second != null ) return false;
|
||||
|
||||
// Check to see whether the values are equal.
|
||||
// If the param of equals is null, it should by contract return false.
|
||||
if( this.first != null && !this.first.equals(other.first) ) return false;
|
||||
if( this.second != null && !this.second.equals(other.second) ) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic hashcode function. Assume hashcodes of first and second are
|
||||
* randomly distributed and return the XOR of the two.
|
||||
* @return Randomly distributed hashcode of the pair.
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
if( second == null && first == null )
|
||||
return 0;
|
||||
if( second == null )
|
||||
return first.hashCode();
|
||||
if( first == null )
|
||||
return second.hashCode();
|
||||
return first.hashCode() ^ second.hashCode();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return first+","+second;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,97 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import os,sys
|
||||
|
||||
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
||||
logistic_regression_script="/humgen/gsa-scr1/hanna/src/StingWorking/R/logistic_regression.R"
|
||||
|
||||
def exit(msg, errorlevel=1):
    """Print msg and terminate the program with the given exit code.

    errorlevel defaults to 1 so error-path callers that omit it (the
    non-csv check and the missing-R-output check) no longer raise a
    TypeError instead of exiting cleanly.
    """
    print(msg)
    sys.exit(errorlevel)
|
||||
|
||||
def open_source_file(source_filename):
    """Open the covariate counts csv with the given name for reading.

    Exits with an error message if the file is unreadable or does not
    have a .csv extension.
    """
    if not os.access(source_filename, os.R_OK):
        # Report the actual argument, not sys.argv[1] (they differ if a
        # caller passes any other filename).
        exit("Unable to read covariate counts file '" + source_filename + "'", 1)
    if not source_filename.endswith('.csv'):
        # Original omitted the required error code here, crashing exit().
        exit("Source file is in incorrect format. Must be csv.", 1)
    return open(source_filename, 'r')
|
||||
|
||||
def read_header(source_file):
    """Read and validate the header line of the covariate counts csv.

    The first two columns must be 'rg' (read group) and 'dn'
    (dinucleotide); otherwise the program exits.  Returns the header
    fields as a list (the last field keeps its trailing newline).
    """
    header = source_file.readline().split(',')
    # Length guard so a short/empty header reports the format error
    # instead of raising IndexError.
    if len(header) < 2 or header[0] != 'rg' or header[1] != 'dn':
        exit("Input file is in invalid format. First two columns should be <read group>,<dinucleotide>", 1)
    return header
|
||||
|
||||
def create_intermediate_file(source_file, read_group, dinuc, header=None):
    """Create the per-(read group, dinuc) intermediate csv for source_file.

    Writes the coefficient columns of the header (everything past the
    rg/dn columns) and returns the open, writable file handle.  `header`
    may now be passed explicitly; it defaults to the module-level
    `header` global for backward compatibility with the original's
    implicit dependency on it.
    """
    if header is None:
        header = globals()['header']
    base = source_file.name[:source_file.name.rfind('.csv')]
    intermediate_filename = "%s.%s.%s.csv" % (base, read_group, dinuc)
    intermediate_file = open(intermediate_filename, "w")
    # header's last field keeps its newline from readline(), so this
    # line is already newline-terminated.
    intermediate_file.write(','.join(header[2:]))
    return intermediate_file
|
||||
|
||||
def open_target_file(target_filename):
    """Open the output parameters file for writing and emit its header row.

    The header names the read-group and dinuc columns followed by the
    5x5 grid of coefficient labels "p,q", all tab separated.
    """
    target_file = open(target_filename, 'w')
    labels = ["%d,%d" % (p, q) for p in range(5) for q in range(5)]
    target_file.write("rg\tdinuc\t" + "\t".join(labels) + "\t\n")
    return target_file
|
||||
|
||||
def process_file(source_file, read_group, dinuc, target_file):
    """Run the R logistic regression for one (read group, dinuc) slice.

    Invokes the external R script on the intermediate csv derived from
    source_file's name, appends the fitted parameters to target_file as
    one tab-separated row, then deletes the intermediate csv and the R
    parameters file.
    """
    # base here already includes the read group, so base.dinuc.csv below
    # matches the file written by create_intermediate_file.
    base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
    regression_command = ' '.join((R_exe, logistic_regression_script, base, base, dinuc))
    print("Running " + regression_command)
    os.system(regression_command)
    parameters_filename = '.'.join((base, dinuc, 'parameters'))
    if not os.access(parameters_filename, os.R_OK):
        # Original omitted the required error code, crashing exit() itself.
        exit("Unable to read output of R from file " + parameters_filename, 1)
    parameters_file = open(parameters_filename, 'r')
    # R writes the parameters possibly across several lines; flatten to
    # one space-separated list.
    parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
    target_file.write('\t'.join([read_group, dinuc] + parameters) + '\n')
    os.remove('.'.join((base, dinuc, 'csv')))
    os.remove('.'.join((base, dinuc, 'parameters')))
|
||||
|
||||
class LogisticRegressionRunner:
    """Holds the inputs for one logistic regression run.

    Stores the base filename (source filename minus its .csv suffix),
    the read group, and the dinucleotide.
    """
    def __init__(self, source_filename, read_group, dinuc):
        # Original assigned to bare locals (and referenced an undefined
        # `source_file` global), so the instance retained no state.
        # Store on self and derive base from the filename argument.
        self.__base = source_filename[:source_filename.rfind('.csv')]
        self.__read_group = read_group
        self.__dinuc = dinuc
|
||||
|
||||
|
||||
# Command line: <covariate count input csv> <parameters csv>
if len(sys.argv) < 3:
    exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)

source_file = open_source_file(sys.argv[1])
target_file = open_target_file(sys.argv[2])

header = read_header(source_file)

# Stream the covariate csv, splitting it into one intermediate file per
# (read group, dinuc) pair and running the regression as each pair's
# run of lines ends.
# NOTE(review): this assumes the input is grouped by rg,dn — a pair that
# reappears later would overwrite its earlier intermediate file; confirm
# against the producer of the csv.
intermediate_file = None
read_group = None
dinuc = None

for data_line in source_file:
    data_line = data_line.strip()
    if len(data_line) == 0:
        continue
    data = data_line.split(',')
    # New (read group, dinuc) key: finish the previous slice first.
    if read_group != data[0] or dinuc != data[1]:
        if intermediate_file:
            intermediate_file.close()
            # read_group/dinuc still hold the previous pair here.
            process_file(source_file,read_group,dinuc,target_file)
        read_group,dinuc = data[0:2]
        intermediate_file = create_intermediate_file(source_file,read_group,dinuc)
    # Write only the coefficient columns (everything past rg,dn).
    intermediate_file.write(','.join(data[2:])+'\n')

# Process the final open slice, if any data lines were read at all.
if intermediate_file:
    intermediate_file.close()
    process_file(source_file,read_group,dinuc,target_file)

source_file.close()
target_file.close()
|
||||
Loading…
Reference in New Issue