The building blocks for segmenting covariate counting data by read group.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-06-04 19:55:24 +00:00
parent 13be846c2a
commit 40bc4ae39a
4 changed files with 169 additions and 23 deletions

View File

@ -56,11 +56,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
long B;
int pos;
int qual;
String readGroup;
String dinuc;
public RecalData(int pos, int qual, String dinuc ) {
public RecalData(int pos, int qual, String readGroup, String dinuc ) {
this.pos = pos;
this.qual = qual;
this.readGroup = readGroup;
this.dinuc = dinuc;
}
@ -95,7 +97,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
for ( int j = 0; j < MAX_QUAL_SCORE+1; j++) {
for ( int k = 0; k < NDINUCS; k++) {
String dinuc = dinucIndex2bases(k);
RecalData datum = new RecalData(i, j, dinuc);
RecalData datum = new RecalData(i, j, readGroup.getReadGroupId(), dinuc);
data.get(readGroup.getReadGroupId())[i][j][k] = datum;
flattenData.add(datum);
}
@ -115,8 +117,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if ( "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString()) &&
!read.getReadNegativeStrandFlag() &&
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
(DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
//(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION)
int offset = offsets.get(i);
int numBases = read.getReadLength();
@ -178,7 +179,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
for ( RecalData datum: flattenData ) {
if (string2dinucIndex(datum.dinuc) == dinuc_index) {
if (datum.readGroup.equals(readGroup.getReadGroupId()) && string2dinucIndex(datum.dinuc) == dinuc_index) {
if ((datum.N - datum.B) > 0)
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
if (datum.B > 0)
@ -229,10 +230,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
RecalData All = new RecalData(0,0,"");
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
MeanReportedQuality AllReported = new MeanReportedQuality();
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
ByCycle.add(new RecalData(c, -1, "-"));
ByCycle.add(new RecalData(c, -1,readGroup.getReadGroupId(),"-"));
ByCycleReportedQ.add(new MeanReportedQuality());
}
@ -265,10 +266,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
RecalData All = new RecalData(0,0,"");
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
MeanReportedQuality AllReported = new MeanReportedQuality();
for (int c=0; c < NDINUCS; c++) {
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),dinucIndex2bases(c)));
ByCycleReportedQ.add(new MeanReportedQuality());
}
@ -302,10 +303,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
RecalData All = new RecalData(0,0,"");
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
MeanReportedQuality AllReported = new MeanReportedQuality();
for (int q=0; q<MAX_QUAL_SCORE+1; q++) {
ByQ.add(new RecalData(-1,q,"-"));
ByQ.add(new RecalData(-1,q,readGroup.getReadGroupId(),"-"));
ByQReportedQ.add(new MeanReportedQuality());
}

View File

@ -19,7 +19,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
@Argument(shortName="outputBAM", doc="output BAM file", required=false)
public String outputBamFile = null;
HashMap<String, LogisticRegressor> regressors = new HashMap<String, LogisticRegressor>();
Map<Pair<String,String>, LogisticRegressor> regressors = new HashMap<Pair<String,String>, LogisticRegressor>();
private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class);
public void initialize() {
@ -29,26 +29,28 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
for ( String line : lines.subList(1,lines.size()) ) {
// dinuc coeff1 coeff2 ... coeff25
String[] vals = line.split("\\s+");
String dinuc = vals[0];
String readGroup = vals[0];
String dinuc = vals[1];
LogisticRegressor regressor = new LogisticRegressor(2, 4);
regressors.put(dinuc, regressor);
regressors.put(new Pair<String,String>(readGroup,dinuc), regressor);
System.out.printf("Vals = %s%n", Utils.join(",", vals));
for ( int i = 1; i < vals.length; i++ ) {
for ( int i = 2; i <= (vals.length-2); i++ ) {
Pair<Integer, Integer> ij = mapping.get(i-1);
try {
double c = Double.parseDouble(vals[i]);
regressor.setCoefficient(ij.first, ij.second, c);
System.out.printf("Setting coefficient %s => %s = %f%n", dinuc, ij, c);
System.out.printf("Setting coefficient %s,%s => %s = %f%n", readGroup, dinuc, ij, c);
} catch ( NumberFormatException e ) {
Utils.scareUser("Badly formed logistic regression header at " + vals[i] + " line: " + line );
}
}
}
for ( Map.Entry<String, LogisticRegressor> e : regressors.entrySet() ) {
String dinuc = e.getKey();
for ( Map.Entry<Pair<String,String>, LogisticRegressor> e : regressors.entrySet() ) {
String readGroup = e.getKey().first;
String dinuc = e.getKey().second;
LogisticRegressor regressor = e.getValue();
logger.debug(String.format("Regressor: %s => %s", dinuc, regressor));
logger.debug(String.format("Regressor: %s,%s => %s", readGroup, dinuc, regressor));
}
//System.exit(1);
@ -62,10 +64,12 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
ArrayList<Pair<Integer, Integer>> mapping = new ArrayList<Pair<Integer, Integer>>();
String[] elts = headerLine.split("\\s+");
if ( ! elts[0].toLowerCase().startsWith("dinuc") ) // checking only start of first field because dinuc will be followed by a version number to be checekde later
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be dinuc: " + elts[0] + " line: " + headerLine);
if ( ! "rg".equalsIgnoreCase(elts[0]) )
Utils.scareUser("Badly formatted Logistic regression header, upper left keyword must be rg: " + elts[0] + " line: " + headerLine);
if ( ! elts[1].toLowerCase().startsWith("dinuc") ) // checking only start of second field because dinuc will be followed by a version number to be checked later
Utils.scareUser("Badly formatted Logistic regression header, second left keyword must be dinuc: " + elts[1] + " line: " + headerLine);
for ( int k = 1; k < elts.length; k++ ) {
for ( int k = 2; k < elts.length; k++ ) {
String paramStr = elts[k];
String[] ij = paramStr.split(",");
if ( ij.length != 2 ) {
@ -87,6 +91,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
public SAMRecord map(char[] ref, SAMRecord read) {
System.out.println(String.format("Reading: %s (%s:%d-%d)",read.getReadName(),read.getReferenceName(),read.getAlignmentStart(),read.getAlignmentEnd()));
SAMRecord recalRead = read;
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
@ -98,6 +103,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
quals = BaseUtils.reverse(quals);
}
String readGroup = read.getAttribute("RG").toString();
int numBases = read.getReadLength();
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
//recalQuals[numBases-1] = quals[numBases-1]; // can't change last -- no dinuc
@ -106,7 +112,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
//int cycle = i; //read.getReadNegativeStrandFlag() ? numBases - i - 1 : i;
String dinuc = String.format("%c%c", bases[cycle - 1], bases[cycle]);
byte qual = quals[cycle];
LogisticRegressor regressor = regressors.get(dinuc);
LogisticRegressor regressor = regressors.get(new Pair<String,String>(readGroup,dinuc));
byte newQual;
if ( regressor != null ) { // no N or some other unexpected bp in the stream

View File

@ -20,6 +20,48 @@ public class Pair<X,Y> {
*/
public Y getSecond() { return second; }
/**
* Calculate whether this pair object is equal to another object.
* @param o The other object (hopefully a pair).
* @return True if the two are equal; false otherwise.
*/
@Override
public boolean equals( Object o ) {
if( o == null )
return false;
if( !(o instanceof Pair) )
return false;
Pair other = (Pair)o;
// Check to see whether one is null but not the other.
if( this.first == null && other.first != null ) return false;
if( this.second == null && other.second != null ) return false;
// Check to see whether the values are equal.
// If the param of equals is null, it should by contract return false.
if( this.first != null && !this.first.equals(other.first) ) return false;
if( this.second != null && !this.second.equals(other.second) ) return false;
return true;
}
/**
* Basic hashcode function. Assume hashcodes of first and second are
* randomly distributed and return the XOR of the two.
* @return Randomly distributed hashcode of the pair.
*/
@Override
public int hashCode() {
if( second == null && first == null )
return 0;
if( second == null )
return first.hashCode();
if( first == null )
return second.hashCode();
return first.hashCode() ^ second.hashCode();
}
public String toString() {
return first+","+second;
}

View File

@ -0,0 +1,97 @@
#!/usr/bin/env python
import os,sys
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
logistic_regression_script="/humgen/gsa-scr1/hanna/src/StingWorking/R/logistic_regression.R"
def exit(msg, errorlevel=1):
    """Print msg and terminate the program with the given exit status.

    errorlevel now defaults to 1: several error-path callers in this script
    invoke exit() with only a message, which previously raised a TypeError
    instead of terminating cleanly. Existing two-argument callers are
    unaffected.
    """
    print(msg)
    sys.exit(errorlevel)
def open_source_file(source_filename):
    """Open the covariate-counts csv for reading, validating access and suffix.

    Fixes two defects: the unreadable-file message now reports
    source_filename itself (the original interpolated sys.argv[1], which is
    wrong for any caller not using the first command-line argument), and the
    bad-suffix path now passes an explicit errorlevel to exit() (it was
    previously omitted, raising a TypeError inside exit()).
    """
    if not os.access(source_filename, os.R_OK):
        exit("Unable to read covariate counts file '" + source_filename + "'", 1)
    if not source_filename.endswith('.csv'):
        exit("Source file is in incorrect format. Must be csv.", 1)
    return open(source_filename, 'r')
def read_header(source_file):
    """Read and validate the header row of the covariate-counts csv.

    The first two columns must be 'rg' (read group) and 'dn' (dinucleotide);
    anything else aborts the program. Returns the full list of header fields
    (note: the last field keeps its trailing newline, as produced by
    readline().split(',')).
    """
    fields = source_file.readline().split(',')
    if not (fields[0] == 'rg' and fields[1] == 'dn'):
        exit("Input file is in invalid format. First two columns should be <read group>,<dinucleotide>", 1)
    return fields
def create_intermediate_file(source_file, read_group, dinuc, header_fields=None):
    """Create the intermediate csv for one (read group, dinuc) slice.

    The file is named <source base>.<read_group>.<dinuc>.csv next to the
    source file, seeded with the coefficient columns of the header, and
    returned open for writing.

    Fix: the original silently read the module-level global `header`; it is
    now an explicit optional parameter. When header_fields is None the global
    is still used, so the existing call site is unaffected.
    """
    fields = header_fields if header_fields is not None else header
    base = source_file.name[:source_file.name.rfind('.csv')]
    intermediate_filename = "%s.%s.%s.csv" % (base, read_group, dinuc)
    intermediate_file = open(intermediate_filename, "w")
    # Only the coefficient columns are forwarded; the last header field
    # already carries the trailing newline from read_header().
    intermediate_file.write(','.join(fields[2:]))
    return intermediate_file
def open_target_file(target_filename):
    """Open the parameters output file for writing and emit its header row.

    The header is 'rg', 'dinuc', then the 25 'p,q' coefficient labels for
    p,q in 0..4, all tab-terminated, followed by a newline.
    """
    target_file = open(target_filename, 'w')
    labels = ["%d,%d" % (p, q) for p in range(5) for q in range(5)]
    target_file.write("rg\tdinuc\t" + "\t".join(labels) + "\t\n")
    return target_file
def process_file(source_file, read_group, dinuc, target_file):
    """Fit the R logistic regression for one (read group, dinuc) slice and
    append the resulting parameters as one row of target_file.

    Consumes the intermediate csv written by create_intermediate_file()
    (named <base>.<read_group>.<dinuc>.csv) and removes both it and the R
    '.parameters' output afterwards. Aborts if R produced no readable
    parameters file.
    """
    base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
    regression_command = ' '.join((R_exe, logistic_regression_script, base, base, dinuc))
    print("Running " + regression_command)
    os.system(regression_command)
    parameters_filename = '.'.join((base, dinuc, 'parameters'))
    if not os.access(parameters_filename, os.R_OK):
        # Fix: the original omitted the mandatory errorlevel here, so this
        # error path raised a TypeError inside exit().
        exit("Unable to read output of R from file " + parameters_filename, 1)
    parameters_file = open(parameters_filename, 'r')
    parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
    parameters_file.close()  # fix: was left open before removal below
    target_file.write('\t'.join([read_group, dinuc] + parameters) + '\n')
    os.remove('.'.join((base, dinuc, 'csv')))
    os.remove('.'.join((base, dinuc, 'parameters')))
class LogisticRegressionRunner:
    """Holds the inputs for one logistic-regression run.

    Fix: the original __init__ assigned to bare local names (so nothing was
    stored on the instance) and referenced the undefined name `source_file`
    instead of the `source_filename` parameter, crashing on construction.
    NOTE(review): this class is not referenced elsewhere in this diff --
    confirm it is actually used before extending it.
    """
    def __init__(self, source_filename, read_group, dinuc):
        # Strip the trailing '.csv' to form the filename base used by the
        # intermediate/parameters files.
        self.base = source_filename[:source_filename.rfind('.csv')]
        self.read_group = read_group
        self.dinuc = dinuc
# ---- Script body ----
# Split the covariate-counts csv into one intermediate file per
# (read group, dinuc) key and fit a logistic regression to each via R,
# collecting all fitted parameters into the target file.
# NOTE(review): assumes input rows are sorted/grouped by (read group, dinuc);
# an unsorted input would re-open and clobber earlier intermediate files --
# confirm against the producer (CovariateCounterWalker output).
if len(sys.argv) < 3:
    exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)

source_file = open_source_file(sys.argv[1])
target_file = open_target_file(sys.argv[2])
header = read_header(source_file)

intermediate_file = None
read_group = None
dinuc = None
for data_line in source_file:
    data_line = data_line.strip()
    if len(data_line) == 0:
        continue  # skip blank lines
    data = data_line.split(',')
    # On a (read group, dinuc) key change: finish the previous slice
    # (close its intermediate file and run R on it), then start a new one.
    if read_group != data[0] or dinuc != data[1]:
        if intermediate_file:
            intermediate_file.close()
            process_file(source_file,read_group,dinuc,target_file)
        read_group,dinuc = data[0:2]
        intermediate_file = create_intermediate_file(source_file,read_group,dinuc)
    # Forward only the coefficient columns to the intermediate file.
    intermediate_file.write(','.join(data[2:])+'\n')
# Flush and process the final slice, if any data rows were seen at all.
if intermediate_file:
    intermediate_file.close()
    process_file(source_file,read_group,dinuc,target_file)
source_file.close()
target_file.close()