FOR KIRAN: Adding R scripts and functions that perform theoretical calculations of phasing
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5611 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9c36b0a39b
commit
822b4d66ce
|
|
@ -0,0 +1,54 @@
|
|||
#
|
||||
#options(warn=2)
|
||||
#options(error=recover)
|
||||
#
|
||||
|
||||
HALF = high_dimensional_integrate(1, -200, 0, dnorm)
|
||||
print(paste("Should be ~ HALF: ", HALF, sep=""))
|
||||
|
||||
|
||||
k = 75
|
||||
#theta = 10^-2
|
||||
theta = 10^-3
|
||||
|
||||
p = pHetPairLteDistance(k, theta)
|
||||
print(paste(p, " = pHetPairLteDistance(k= ", k, ", theta= ", theta, ")", sep=""))
|
||||
|
||||
|
||||
L = 76
|
||||
fragmentSize = 452
|
||||
|
||||
|
||||
p = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, fragmentSize, k)
|
||||
print(paste(p, " = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L= ", L, ", fragmentSize= ", fragmentSize, ", k= ", k, ")", sep=""))
|
||||
|
||||
Fm = 392
|
||||
Fs = 44
|
||||
|
||||
p = pFragmentSize(300, Fm, Fs)
|
||||
print(paste(p, " = pFragmentSize(300, Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
|
||||
|
||||
|
||||
p = pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs)
|
||||
print(paste(p, " = pFragmentsReadsCanCoverHetPairAtDistance(L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
|
||||
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
p = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs)
|
||||
print(paste(p, " = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
|
||||
|
||||
|
||||
p = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, Fm, Fs)
|
||||
print(paste(p, " = pDirectlyPhaseHetPair(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
|
||||
|
||||
|
||||
windowDistances = c(100, 100, 100, 100, 100)
|
||||
phaseIndex = 2
|
||||
p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs)
|
||||
print(paste(p, " = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances= (", paste(windowDistances, collapse=", "), "), phaseIndex= ", phaseIndex, ", meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
|
||||
|
||||
|
||||
|
||||
traceback()
|
||||
warnings()
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
theta = 10^-3
|
||||
params = paste("theta= ", theta, sep="")
|
||||
|
||||
MIN_DIST = 1
|
||||
MAX_DIST = 10^4
|
||||
BY_DIST = 10
|
||||
DISTANCES = seq(from=MIN_DIST, to=MAX_DIST+BY_DIST, by=BY_DIST)
|
||||
freqAtLteDist = pHetPairLteDistance(DISTANCES, theta)
|
||||
|
||||
scatter(DISTANCES, freqAtLteDist, "intraHetDistancesDistrib", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", main=params)
|
||||
|
||||
|
||||
save(list = ls(all=TRUE), file = "intraHetDistancesDistrib.RData")
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
theta = 10^-3
|
||||
|
||||
Fm_BASE = 392 - 2 * 101 # The mean insert size == 190
|
||||
Fs = 44
|
||||
|
||||
nReadsToPhase = 1
|
||||
|
||||
params = paste("nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="")
|
||||
|
||||
|
||||
|
||||
MEAN_DEPTHS = 0:65
|
||||
NUM_DEPTHS = length(MEAN_DEPTHS)
|
||||
|
||||
READ_LENGTHS = c(18, 36, 76, 101, 125, 150, 175, 200, 400, 800, 1000)
|
||||
READ_LENGTHS = rev(READ_LENGTHS)
|
||||
NUM_READ_LENGTHS = length(READ_LENGTHS)
|
||||
|
||||
depthsX = list()
|
||||
depthsY = list()
|
||||
depthsLeg = vector()
|
||||
|
||||
for (i in 1:NUM_READ_LENGTHS) {
|
||||
pPhaseDepth = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_DEPTHS))
|
||||
Fm = Fm_BASE + 2 * READ_LENGTHS[i]
|
||||
for (j in 1:NUM_DEPTHS) {
|
||||
pPhaseDepth[j] = pDirectlyPhaseHetPair(MEAN_DEPTHS[j], nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs)
|
||||
}
|
||||
depthsX[i] = list(MEAN_DEPTHS)
|
||||
depthsY[i] = list(pPhaseDepth)
|
||||
depthsLeg[i] = paste("L= ", READ_LENGTHS[i], sep="")
|
||||
}
|
||||
|
||||
scatter(depthsX, depthsY, "testDepths", xlab="Mean depth", ylab="Phaseability", main=params, leg=depthsLeg, legPos="topleft", width=14, height=7, type="b")
|
||||
|
||||
|
||||
|
||||
|
||||
save(list = ls(all=TRUE), file = "testDepths.RData")
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
theta = 10^-3
|
||||
|
||||
L = 101
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
|
||||
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, sep="")
|
||||
|
||||
|
||||
|
||||
MEAN_SIZES = seq(1,2000,20)
|
||||
STD_SIZES = seq(0,200,5)
|
||||
|
||||
|
||||
testFragments = matrix(nrow=length(MEAN_SIZES), ncol=length(STD_SIZES))
|
||||
for (i in 1:length(MEAN_SIZES)) {
|
||||
test_mean_fragment_size = MEAN_SIZES[i]
|
||||
print(paste("test_mean_fragment_size: ", test_mean_fragment_size, sep=""))
|
||||
for (j in 1:length(STD_SIZES)) {
|
||||
test_std_fragment_size = STD_SIZES[j]
|
||||
print(paste("test_std_fragment_size: ", test_std_fragment_size, sep=""))
|
||||
|
||||
testFragments[i,j] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, test_mean_fragment_size, test_std_fragment_size)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pdf('testFragments.pdf')
|
||||
|
||||
library(gplots)
|
||||
heatmap.2(testFragments, ylab = "Mean fragment size", xlab = "Standard deviation fragment size", labRow = MEAN_SIZES, labCol = STD_SIZES, Rowv = NA, Colv = NA, dendrogram = "none", scale="none", revC = FALSE, density.info="none", trace="none", main=params)
|
||||
|
||||
library(scatterplot3d)
|
||||
xMeans = as.vector(t(matrix(rep.int(MEAN_SIZES, length(STD_SIZES)), ncol = length(STD_SIZES))))
|
||||
yStds = rep.int(STD_SIZES, length(MEAN_SIZES))
|
||||
zPhaseRate = as.vector(t(testFragments))
|
||||
scatterplot3d(xMeans, yStds, zPhaseRate, xlab = "Mean fragment size", ylab = "Standard deviation fragment size", zlab = "Phasing rate", main=params)
|
||||
|
||||
bestCombo = which.max(zPhaseRate)
|
||||
print(paste("For ", params, ", BEST choice gives phaseability of ", zPhaseRate[bestCombo], " using mean fragment = ", xMeans[bestCombo], ", std. fragment = ", yStds[bestCombo], sep = ""))
|
||||
dev.off()
|
||||
|
||||
|
||||
|
||||
|
||||
save(list = ls(all=TRUE), file = "testFragments.RData")
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
L = 101
|
||||
|
||||
Fm = 392
|
||||
Fs = 44
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
|
||||
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
|
||||
|
||||
|
||||
|
||||
MEAN_INTRA_HET_DISTANCES = seq(from=2, to=20002, by=50)
|
||||
THETAS = meanIntraHetDistanceToTheta(MEAN_INTRA_HET_DISTANCES)
|
||||
NUM_THETAS = length(THETAS)
|
||||
|
||||
pPhaseTheta = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_THETAS))
|
||||
for (i in 1:NUM_THETAS) {
|
||||
pPhaseTheta[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, THETAS[i], Fm, Fs)
|
||||
}
|
||||
scatter(MEAN_INTRA_HET_DISTANCES, pPhaseTheta, "testIntraHetDistances", xlab="Mean intra-het distance", ylab="Phaseability", main=params, type="b")
|
||||
|
||||
|
||||
|
||||
save(list = ls(all=TRUE), file = "testIntraHetDistances.RData")
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
theta = 10^-3
|
||||
|
||||
Fm_BASE = 392 - 2 * 101 # The mean insert size == 190
|
||||
Fs = 44
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
|
||||
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="")
|
||||
|
||||
|
||||
READ_LENGTHS = seq(from=30, to=1000, by=10)
|
||||
NUM_READ_LENGTHS = length(READ_LENGTHS)
|
||||
|
||||
pPhaseReadLength = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_READ_LENGTHS))
|
||||
for (i in 1:NUM_READ_LENGTHS) {
|
||||
Fm = Fm_BASE + 2 * READ_LENGTHS[i]
|
||||
pPhaseReadLength[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs)
|
||||
}
|
||||
scatter(READ_LENGTHS, pPhaseReadLength, "testReadLengths", xlab="Read length", ylab="Phaseability", main=params, type="b")
|
||||
|
||||
|
||||
|
||||
save(list = ls(all=TRUE), file = "testReadLengths.RData")
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
L = 101
|
||||
|
||||
Fm = 392
|
||||
Fs = 44
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
|
||||
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
|
||||
|
||||
|
||||
DISTANCES = 0:1000
|
||||
pPhaseHetPairAtDistWithRead = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, DISTANCES, Fm, Fs)
|
||||
|
||||
scatter(DISTANCES, pPhaseHetPairAtDistWithRead, "testSpecificDistances", xlab="Intra-het distance", ylab="Phaseability", main=params)
|
||||
|
||||
|
||||
|
||||
save(list = ls(all=TRUE), file = "testSpecificDistances.RData")
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
L = 76
|
||||
k = 75
|
||||
params = paste("L= ", L, ", k= ", k, sep="")
|
||||
|
||||
FRAGMENT_SIZES = 0:100 + 2 * L
|
||||
pCoverHetPairWithRead = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, FRAGMENT_SIZES, k)
|
||||
|
||||
scatter(FRAGMENT_SIZES, pCoverHetPairWithRead, "testSpecificFragments", xlab="Fragment size", ylab="Probability of covering het pair", main=params)
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
theta = 10^-3
|
||||
|
||||
Fm = 392
|
||||
Fs = 44
|
||||
|
||||
L = 101
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
|
||||
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
|
||||
|
||||
#
|
||||
#options(warn=2)
|
||||
#options(error=recover)
|
||||
#
|
||||
|
||||
MAX_WINDOW_SIZE = 10
|
||||
|
||||
WINDOW_SIZES = 2:MAX_WINDOW_SIZE
|
||||
NUM_WINDOW_SIZES = length(WINDOW_SIZES)
|
||||
|
||||
pPhaseWindow = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_WINDOW_SIZES))
|
||||
for (i in 1:NUM_WINDOW_SIZES) {
|
||||
n = WINDOW_SIZES[i]
|
||||
phaseIndex = middleOfWindowIndex(n)
|
||||
pPhaseWindow[i] = pDirectlyPhaseHetPairUsingWindow(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex)
|
||||
|
||||
save(list = ls(all=TRUE), file = "testWindows.RData")
|
||||
}
|
||||
|
||||
scatter(WINDOW_SIZES, pPhaseWindow, "testWindows", xlab="Window size", ylab="Phaseability", main=params, type="b")
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
L = 101
|
||||
|
||||
Fm = 392
|
||||
Fs = 44
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
|
||||
theta = 10^-3
|
||||
|
||||
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
|
||||
|
||||
|
||||
MAX_NUM_DISTS = 10^4
|
||||
distances = sampleIntraHetDistances(MAX_NUM_DISTS, theta)
|
||||
print(paste("Using ", MAX_NUM_DISTS, " THEORETICAL distances...", sep=""))
|
||||
|
||||
|
||||
MAX_WINDOW_SIZE = 10
|
||||
FILE_NAME = "theoretical_window"
|
||||
|
||||
phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME)
|
||||
phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow
|
||||
WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES
|
||||
|
||||
phaseProbsWindow = colMeans(phaseProbsPositionWindow)
|
||||
|
||||
scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b")
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
L = 101
|
||||
|
||||
Fm = 392
|
||||
Fs = 44
|
||||
|
||||
meanDepth = 65
|
||||
nReadsToPhase = 1
|
||||
|
||||
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
|
||||
|
||||
|
||||
distances = scan("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt", what=list(dist=0))
|
||||
distances = distances$dist
|
||||
|
||||
MAX_NUM_DISTS = 10^4
|
||||
NUM_DISTS_TO_USE = min(MAX_NUM_DISTS, length(distances))
|
||||
distances = distances[1:NUM_DISTS_TO_USE]
|
||||
print(paste("Using ", NUM_DISTS_TO_USE, " EMPIRICAL distances...", sep=""))
|
||||
|
||||
|
||||
MAX_WINDOW_SIZE = 10
|
||||
FILE_NAME = "theoretical_window_on_empirical"
|
||||
|
||||
phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME)
|
||||
phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow
|
||||
WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES
|
||||
|
||||
phaseProbsWindow = colMeans(phaseProbsPositionWindow)
|
||||
|
||||
scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b")
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
NUM_chr1_HET_SITES = as.integer(system("grep -c 'chr1:' ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list", intern=TRUE))
|
||||
NUM_chr1_PHASEABLE_HET_SITES = NUM_chr1_HET_SITES - 1 # since can't phase the first het site
|
||||
|
||||
|
||||
#
|
||||
#USE_EMPIRICAL_WINDOWS = c(10, 2)
|
||||
#
|
||||
USE_EMPIRICAL_WINDOWS = c(2)
|
||||
|
||||
|
||||
TWO_COLORS = c("red", "darkgreen")
|
||||
|
||||
|
||||
######################################################################
|
||||
# Phasing as a function of SPECIFIC intra-het distances:
|
||||
######################################################################
|
||||
load("testSpecificDistances.RData")
|
||||
|
||||
MAX_DISTANCE = 10^3
|
||||
PQ_PHASING_THRESH = 10.0
|
||||
|
||||
distances = list()
|
||||
phaseRateDistances = list()
|
||||
distancesLeg = vector()
|
||||
|
||||
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
|
||||
n = USE_EMPIRICAL_WINDOWS[nextIndex]
|
||||
n_locDistancePQReadsWindow <- scan(paste("~fromer/storage/phase.NA12878/phase_all_chr.n_", n, ".NA12878", ".locus_distance_PQ_numReads_windowSize.txt", sep=""), what=list(loci="", distance=0, PQ=0, reads=0, window=0))
|
||||
n_distance <- n_locDistancePQReadsWindow$distance
|
||||
n_PQ <- n_locDistancePQReadsWindow$PQ
|
||||
|
||||
distanceVector = sort(unique(n_distance))
|
||||
distanceVector = distanceVector[which(distanceVector <= MAX_DISTANCE)]
|
||||
numDists = length(distanceVector)
|
||||
|
||||
phasedFractionVector = as.vector(matrix(data=-1, nrow=1, ncol=numDists))
|
||||
|
||||
print(paste("numDists= ", numDists, sep=""))
|
||||
print(paste(distanceVector, collapse=", "))
|
||||
|
||||
for (i in 1:numDists) {
|
||||
d = distanceVector[i]
|
||||
print(paste("d= ", d, sep=""))
|
||||
|
||||
dInds = which(n_distance == d)
|
||||
phasedFractionVector[i] = length(which(n_PQ[dInds] >= PQ_PHASING_THRESH)) / length(dInds)
|
||||
}
|
||||
|
||||
distances[nextIndex] = list(distanceVector)
|
||||
phaseRateDistances[nextIndex] = list(phasedFractionVector)
|
||||
distancesLeg[nextIndex] = paste("HiSeq (window = ", n, ")", sep="")
|
||||
}
|
||||
|
||||
nextIndex = nextIndex+1
|
||||
distances[nextIndex] = list(DISTANCES)
|
||||
phaseRateDistances[nextIndex] = list(pPhaseHetPairAtDistWithRead)
|
||||
distancesLeg[nextIndex] = "Theoretical (window = 2)" # params
|
||||
|
||||
scatter(distances, phaseRateDistances, "specific_distances.theoretical_empirical", xlab="Intra-het distance", ylab="Phaseability", leg=distancesLeg, legPos="topright", width=14, height=7, type="b", col=TWO_COLORS)
|
||||
|
||||
|
||||
|
||||
######################################################################
|
||||
# Phasing as a function of depth:
|
||||
######################################################################
|
||||
load("testDepths.RData")
|
||||
|
||||
depths = list()
|
||||
phaseRateDepths = list()
|
||||
depthsLeg = vector()
|
||||
|
||||
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
|
||||
n = USE_EMPIRICAL_WINDOWS[nextIndex]
|
||||
RGdocPhasedConsistentSwitch = scan(paste("~fromer/storage/downsampled_phasing.NA12878.HiSeq/RG.DoC_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(RGdoc=0, phased=0, consistentPhased=0, switch=0.0))
|
||||
depths[nextIndex] = list(RGdocPhasedConsistentSwitch$RGdoc)
|
||||
phaseRateDepths[nextIndex] = list(RGdocPhasedConsistentSwitch$phased / NUM_chr1_PHASEABLE_HET_SITES)
|
||||
depthsLeg[nextIndex] = paste("Down-sampled HiSeq (window = ", n, ")", sep="")
|
||||
}
|
||||
|
||||
nextIndex = nextIndex+1
|
||||
useLength = which(READ_LENGTHS == 101)
|
||||
depths[nextIndex] = depthsX[useLength]
|
||||
phaseRateDepths[nextIndex] = depthsY[useLength]
|
||||
depthsLeg[nextIndex] = "Theoretical (window = 2)" # params
|
||||
|
||||
scatter(depths, phaseRateDepths, "depths.theoretical_empirical", xlab="Mean depth", ylab="Phaseability", leg=depthsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||||
|
||||
|
||||
|
||||
######################################################################
|
||||
# Distribution of intra-het distances:
|
||||
######################################################################
|
||||
load("intraHetDistancesDistrib.RData")
|
||||
|
||||
empiricalIntraHetDistances = read.table("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt")$V1
|
||||
empiricalIntraHetDistances[which(empiricalIntraHetDistances >= MAX_DIST)] = MAX_DIST
|
||||
|
||||
empiricalIntraHetDistancesHist = hist(empiricalIntraHetDistances, breaks=DISTANCES, plot=FALSE)
|
||||
empiricalIntraHetDistancesCumulativeFrequencies = cumsum(empiricalIntraHetDistancesHist$counts) / length(empiricalIntraHetDistances)
|
||||
|
||||
scatter(list(empiricalIntraHetDistancesHist$mids, DISTANCES), list(empiricalIntraHetDistancesCumulativeFrequencies, freqAtLteDist), "intraHetDistancesDistrib.theoretical_empirical", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", leg=c("NA12878 HiSeq", "Theoretical"), legPos="topleft", type="b", col=TWO_COLORS)
|
||||
|
||||
|
||||
|
||||
######################################################################
|
||||
# Phasing as a function of MEAN intra-het distance:
|
||||
######################################################################
|
||||
load("testIntraHetDistances.RData")
|
||||
|
||||
hetDistances = list()
|
||||
phaseRateHetDistances = list()
|
||||
hetDistancesLeg = vector()
|
||||
|
||||
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
|
||||
n = USE_EMPIRICAL_WINDOWS[nextIndex]
|
||||
meanHetDistNumSitesPhasedConsistentSwitch = scan(paste("~fromer/storage/remove_het_sites.NA12878.HiSeq/meanHetDist_numSites_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(meanHetDist=0.0, numSites=0, phased=0, consistentPhased=0, switch=0.0))
|
||||
|
||||
hetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$meanHetDist)
|
||||
phaseRateHetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$phased)
|
||||
hetDistancesLeg[nextIndex] = paste("Removed hets from HiSeq (window = ", n, ")", sep="")
|
||||
}
|
||||
|
||||
nextIndex = nextIndex+1
|
||||
hetDistances[nextIndex] = list(MEAN_INTRA_HET_DISTANCES)
|
||||
phaseRateHetDistances[nextIndex] = list(pPhaseTheta)
|
||||
hetDistancesLeg[nextIndex] = "Theoretical (window = 2)" # params
|
||||
|
||||
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS)
|
||||
|
||||
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.log.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS, log="y", xlim=c(1, 20000))
|
||||
|
||||
|
||||
######################################################################
|
||||
# Phasing as a function of window size:
|
||||
######################################################################
|
||||
load("theoretical_window_on_empirical.RData")
|
||||
|
||||
windows = list()
|
||||
phaseRateWindows = list()
|
||||
windowsLeg = vector()
|
||||
|
||||
NUM_HET_SITES = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | wc -l", intern=TRUE))
|
||||
NUM_CHR = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | cut -f1 -d':' | sort | uniq | wc -l", intern=TRUE))
|
||||
NUM_PHASEABLE_HET_SITES = NUM_HET_SITES - NUM_CHR # since can't phase the first het site of each chromosome
|
||||
|
||||
|
||||
windowPhasedConsistent = scan(paste("~fromer/storage/phase.NA12878/window_phased_consistent.txt", sep=""), what=list(window=0, phased=0, consistentPhased=0))
|
||||
windows[1] = list(windowPhasedConsistent$window)
|
||||
phaseRateWindows[1] = list(windowPhasedConsistent$phased / NUM_PHASEABLE_HET_SITES)
|
||||
windowsLeg[1] = paste("HiSeq", sep="")
|
||||
|
||||
|
||||
windows[2] = list(WINDOW_SIZES)
|
||||
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
|
||||
windowsLeg[2] = "Theoretical" # params
|
||||
|
||||
scatter(windows, phaseRateWindows, "windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||||
|
||||
|
||||
|
||||
# Use numerical integration over theoretical distances distribution:
|
||||
load("testWindows.RData")
|
||||
|
||||
doneInds = which(pPhaseWindow != -1)
|
||||
|
||||
windows[2] = list(WINDOW_SIZES[doneInds])
|
||||
phaseRateWindows[2] = list(pPhaseWindow[doneInds])
|
||||
windowsLeg[2] = "Theoretical" # params
|
||||
|
||||
scatter(windows, phaseRateWindows, "theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||||
|
||||
|
||||
|
||||
# Use theoretical sampling of distances:
|
||||
load("theoretical_window.RData")
|
||||
|
||||
windows[2] = list(WINDOW_SIZES)
|
||||
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
|
||||
windowsLeg[2] = "Theoretical" # params
|
||||
|
||||
scatter(windows, phaseRateWindows, "sampled_theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||||
Loading…
Reference in New Issue