Python Cookbook

The output file formats of all DNAscent executables were specifically designed to be easy to parse with short (Python, Perl, etc.) scripts with the aim of making it simple for users to make application-specific plots. Here, we provide a brief “cookbook” of barebones analysis scripts that can be copied and modified by users.

The following barebones script parses the output of DNAscent detect. We iterate line-by-line and parse each field in the file.

f = open('path/to/output.detect','r')

for line in f:

     #ignore the header lines
     if line[0] == '#':
             continue

     #split the line into a list by whitespace
     splitLine = line.rstrip().split()

     if line[0] == '>':

             readID = splitLine[0][1:]
             chromosome = splitLine[1]
             refStart = int(splitLine[2])
             refEnd = int(splitLine[3])
             strand = splitLine[4]
     else:
             posOnRef = int(splitLine[0])
             probEdU = float(splitLine[1])
             probBrdU = float(splitLine[2])
             sixMerOnRef = splitLine[3]

             #add these values to a container or do some processing here

f.close()

The following example plots a histogram of fork track lengths from bed files generated by DNAscent forkSense.

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed']

forkLengths = []

for fn in fnames:
     f = open(fn,'r')

        for line in f:

             #ignore the header lines
             if line[0] == '#':
                     continue

             #split the line into a list by whitespace
             splitLine = line.rstrip().split()

             lbound = int(splitLine[1])
             rbound = int(splitLine[2])

             forkLengths.append(rbound-lbound)

        f.close()

plt.figure()
plt.hist(forkLengths)
plt.xlabel('Fork Track Length (bp)')
plt.ylabel('Count')
plt.savefig('forkTrackLen.pdf')
plt.close()

The following example plots a histogram of fork stress scores from bed files generated by DNAscent forkSense.

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed']

forkStress = []

for fn in fnames:
     f = open(fn,'r')

        for line in f:

             #ignore the header lines
             if line[0] == '#':
                     continue

             #split the line into a list by whitespace
             splitLine = line.rstrip().split()

             score = float(splitLine[7])

             #ignore cases where a stress call was declined by only plotting non-negative scores
             if score >= 0.:
                     forkStress.append(score)

        f.close()

plt.figure()
plt.hist(forkStress)
plt.xlabel('Fork Stress Score')
plt.ylabel('Count')
plt.savefig('forkStress.pdf')
plt.close()

The following example pulls out stressed forks from the fork bed file.

stressThreshold = 0.9 #a sensible threshold for paused/stalled forks

fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed']

for fn in fnames:
     f = open(fn,'r')

     #make a new bed file to write on
     fn_split = fn.split('.')
     f_out = open(fn_split[0]+'_stressed.bed','w')

        for line in f:

             #ignore the header lines
             if line[0] == '#':
                     f_out.write(line)
                     continue

             #split the line into a list by whitespace
             splitLine = line.rstrip().split()

             score = float(splitLine[7])

             if score >= stressThreshold:
                     f_out.write(line)

        f.close()
        f_out.close()