182 lines
8.1 KiB
R
182 lines
8.1 KiB
R
|
|
NUM_chr1_HET_SITES = as.integer(system("grep -c 'chr1:' ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list", intern=TRUE))
|
||
|
|
NUM_chr1_PHASEABLE_HET_SITES = NUM_chr1_HET_SITES - 1 # since can't phase the first het site
|
||
|
|
|
||
|
|
|
||
|
|
#
|
||
|
|
#USE_EMPIRICAL_WINDOWS = c(10, 2)
|
||
|
|
#
|
||
|
|
USE_EMPIRICAL_WINDOWS = c(2)
|
||
|
|
|
||
|
|
|
||
|
|
TWO_COLORS = c("red", "darkgreen")
|
||
|
|
|
||
|
|
|
||
|
|
######################################################################
|
||
|
|
# Phasing as a function of SPECIFIC intra-het distances:
|
||
|
|
######################################################################
|
||
|
|
load("testSpecificDistances.RData")
|
||
|
|
|
||
|
|
MAX_DISTANCE = 10^3
|
||
|
|
PQ_PHASING_THRESH = 10.0
|
||
|
|
|
||
|
|
distances = list()
|
||
|
|
phaseRateDistances = list()
|
||
|
|
distancesLeg = vector()
|
||
|
|
|
||
|
|
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
|
||
|
|
n = USE_EMPIRICAL_WINDOWS[nextIndex]
|
||
|
|
n_locDistancePQReadsWindow <- scan(paste("~fromer/storage/phase.NA12878/phase_all_chr.n_", n, ".NA12878", ".locus_distance_PQ_numReads_windowSize.txt", sep=""), what=list(loci="", distance=0, PQ=0, reads=0, window=0))
|
||
|
|
n_distance <- n_locDistancePQReadsWindow$distance
|
||
|
|
n_PQ <- n_locDistancePQReadsWindow$PQ
|
||
|
|
|
||
|
|
distanceVector = sort(unique(n_distance))
|
||
|
|
distanceVector = distanceVector[which(distanceVector <= MAX_DISTANCE)]
|
||
|
|
numDists = length(distanceVector)
|
||
|
|
|
||
|
|
phasedFractionVector = as.vector(matrix(data=-1, nrow=1, ncol=numDists))
|
||
|
|
|
||
|
|
print(paste("numDists= ", numDists, sep=""))
|
||
|
|
print(paste(distanceVector, collapse=", "))
|
||
|
|
|
||
|
|
for (i in 1:numDists) {
|
||
|
|
d = distanceVector[i]
|
||
|
|
print(paste("d= ", d, sep=""))
|
||
|
|
|
||
|
|
dInds = which(n_distance == d)
|
||
|
|
phasedFractionVector[i] = length(which(n_PQ[dInds] >= PQ_PHASING_THRESH)) / length(dInds)
|
||
|
|
}
|
||
|
|
|
||
|
|
distances[nextIndex] = list(distanceVector)
|
||
|
|
phaseRateDistances[nextIndex] = list(phasedFractionVector)
|
||
|
|
distancesLeg[nextIndex] = paste("HiSeq (window = ", n, ")", sep="")
|
||
|
|
}
|
||
|
|
|
||
|
|
nextIndex = nextIndex+1
|
||
|
|
distances[nextIndex] = list(DISTANCES)
|
||
|
|
phaseRateDistances[nextIndex] = list(pPhaseHetPairAtDistWithRead)
|
||
|
|
distancesLeg[nextIndex] = "Theoretical (window = 2)" # params
|
||
|
|
|
||
|
|
scatter(distances, phaseRateDistances, "specific_distances.theoretical_empirical", xlab="Intra-het distance", ylab="Phaseability", leg=distancesLeg, legPos="topright", width=14, height=7, type="b", col=TWO_COLORS)
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
######################################################################
|
||
|
|
# Phasing as a function of depth:
|
||
|
|
######################################################################
|
||
|
|
load("testDepths.RData")
|
||
|
|
|
||
|
|
depths = list()
|
||
|
|
phaseRateDepths = list()
|
||
|
|
depthsLeg = vector()
|
||
|
|
|
||
|
|
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
|
||
|
|
n = USE_EMPIRICAL_WINDOWS[nextIndex]
|
||
|
|
RGdocPhasedConsistentSwitch = scan(paste("~fromer/storage/downsampled_phasing.NA12878.HiSeq/RG.DoC_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(RGdoc=0, phased=0, consistentPhased=0, switch=0.0))
|
||
|
|
depths[nextIndex] = list(RGdocPhasedConsistentSwitch$RGdoc)
|
||
|
|
phaseRateDepths[nextIndex] = list(RGdocPhasedConsistentSwitch$phased / NUM_chr1_PHASEABLE_HET_SITES)
|
||
|
|
depthsLeg[nextIndex] = paste("Down-sampled HiSeq (window = ", n, ")", sep="")
|
||
|
|
}
|
||
|
|
|
||
|
|
nextIndex = nextIndex+1
|
||
|
|
useLength = which(READ_LENGTHS == 101)
|
||
|
|
depths[nextIndex] = depthsX[useLength]
|
||
|
|
phaseRateDepths[nextIndex] = depthsY[useLength]
|
||
|
|
depthsLeg[nextIndex] = "Theoretical (window = 2)" # params
|
||
|
|
|
||
|
|
scatter(depths, phaseRateDepths, "depths.theoretical_empirical", xlab="Mean depth", ylab="Phaseability", leg=depthsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
######################################################################
|
||
|
|
# Distribution of intra-het distances:
|
||
|
|
######################################################################
|
||
|
|
load("intraHetDistancesDistrib.RData")
|
||
|
|
|
||
|
|
empiricalIntraHetDistances = read.table("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt")$V1
|
||
|
|
empiricalIntraHetDistances[which(empiricalIntraHetDistances >= MAX_DIST)] = MAX_DIST
|
||
|
|
|
||
|
|
empiricalIntraHetDistancesHist = hist(empiricalIntraHetDistances, breaks=DISTANCES, plot=FALSE)
|
||
|
|
empiricalIntraHetDistancesCumulativeFrequencies = cumsum(empiricalIntraHetDistancesHist$counts) / length(empiricalIntraHetDistances)
|
||
|
|
|
||
|
|
scatter(list(empiricalIntraHetDistancesHist$mids, DISTANCES), list(empiricalIntraHetDistancesCumulativeFrequencies, freqAtLteDist), "intraHetDistancesDistrib.theoretical_empirical", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", leg=c("NA12878 HiSeq", "Theoretical"), legPos="topleft", type="b", col=TWO_COLORS)
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
######################################################################
|
||
|
|
# Phasing as a function of MEAN intra-het distance:
|
||
|
|
######################################################################
|
||
|
|
load("testIntraHetDistances.RData")
|
||
|
|
|
||
|
|
hetDistances = list()
|
||
|
|
phaseRateHetDistances = list()
|
||
|
|
hetDistancesLeg = vector()
|
||
|
|
|
||
|
|
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
|
||
|
|
n = USE_EMPIRICAL_WINDOWS[nextIndex]
|
||
|
|
meanHetDistNumSitesPhasedConsistentSwitch = scan(paste("~fromer/storage/remove_het_sites.NA12878.HiSeq/meanHetDist_numSites_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(meanHetDist=0.0, numSites=0, phased=0, consistentPhased=0, switch=0.0))
|
||
|
|
|
||
|
|
hetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$meanHetDist)
|
||
|
|
phaseRateHetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$phased)
|
||
|
|
hetDistancesLeg[nextIndex] = paste("Removed hets from HiSeq (window = ", n, ")", sep="")
|
||
|
|
}
|
||
|
|
|
||
|
|
nextIndex = nextIndex+1
|
||
|
|
hetDistances[nextIndex] = list(MEAN_INTRA_HET_DISTANCES)
|
||
|
|
phaseRateHetDistances[nextIndex] = list(pPhaseTheta)
|
||
|
|
hetDistancesLeg[nextIndex] = "Theoretical (window = 2)" # params
|
||
|
|
|
||
|
|
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS)
|
||
|
|
|
||
|
|
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.log.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS, log="y", xlim=c(1, 20000))
|
||
|
|
|
||
|
|
|
||
|
|
######################################################################
|
||
|
|
# Phasing as a function of window size:
|
||
|
|
######################################################################
|
||
|
|
load("theoretical_window_on_empirical.RData")
|
||
|
|
|
||
|
|
windows = list()
|
||
|
|
phaseRateWindows = list()
|
||
|
|
windowsLeg = vector()
|
||
|
|
|
||
|
|
NUM_HET_SITES = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | wc -l", intern=TRUE))
|
||
|
|
NUM_CHR = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | cut -f1 -d':' | sort | uniq | wc -l", intern=TRUE))
|
||
|
|
NUM_PHASEABLE_HET_SITES = NUM_HET_SITES - NUM_CHR # since can't phase the first het site of each chromosome
|
||
|
|
|
||
|
|
|
||
|
|
windowPhasedConsistent = scan(paste("~fromer/storage/phase.NA12878/window_phased_consistent.txt", sep=""), what=list(window=0, phased=0, consistentPhased=0))
|
||
|
|
windows[1] = list(windowPhasedConsistent$window)
|
||
|
|
phaseRateWindows[1] = list(windowPhasedConsistent$phased / NUM_PHASEABLE_HET_SITES)
|
||
|
|
windowsLeg[1] = paste("HiSeq", sep="")
|
||
|
|
|
||
|
|
|
||
|
|
windows[2] = list(WINDOW_SIZES)
|
||
|
|
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
|
||
|
|
windowsLeg[2] = "Theoretical" # params
|
||
|
|
|
||
|
|
scatter(windows, phaseRateWindows, "windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
# Use numerical integration over theoretical distances distribution:
|
||
|
|
load("testWindows.RData")
|
||
|
|
|
||
|
|
doneInds = which(pPhaseWindow != -1)
|
||
|
|
|
||
|
|
windows[2] = list(WINDOW_SIZES[doneInds])
|
||
|
|
phaseRateWindows[2] = list(pPhaseWindow[doneInds])
|
||
|
|
windowsLeg[2] = "Theoretical" # params
|
||
|
|
|
||
|
|
scatter(windows, phaseRateWindows, "theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
# Use theoretical sampling of distances:
|
||
|
|
load("theoretical_window.RData")
|
||
|
|
|
||
|
|
windows[2] = list(WINDOW_SIZES)
|
||
|
|
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
|
||
|
|
windowsLeg[2] = "Theoretical" # params
|
||
|
|
|
||
|
|
scatter(windows, phaseRateWindows, "sampled_theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|