Python Cookbook

The output file formats of all DNAscent executables were specifically designed to be easy to parse with short (Python, Perl, etc.) scripts with the aim of making it simple for users to make application-specific plots. Here, we provide a brief “cookbook” of barebones analysis scripts that can be copied and modified by users.

The following barebones script parses the output of DNAscent detect. We iterate line-by-line and parse each field in the file.

f = open('path/to/output.detect','r')

for line in f:

     #ignore the header lines
     if line[0] == '#':
             continue

     #split the line into a list by whitespace
     splitLine = line.rstrip().split()

     if line[0] == '>':

             readID = splitLine[0][1:]
             chromosome = splitLine[1]
             refStart = int(splitLine[2])
             refEnd = int(splitLine[3])
             strand = splitLine[4]
     else:
             posOnRef = int(splitLine[0])
             probEdU = float(splitLine[1])
             probBrdU = float(splitLine[2])
             kmerOnRef = splitLine[3]

             #add these values to a container or do some processing here

f.close()

The following example plots a histogram of fork track lengths from bed files generated by DNAscent forkSense.

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed']

forkLengths = []

for fn in fnames:
     f = open(fn,'r')

        for line in f:

             #ignore the header lines
             if line[0] == '#':
                     continue

             #split the line into a list by whitespace
             splitLine = line.rstrip().split()

             spanOnQuery = int(splitLine[7])

             forkLengths.append(spanOnQuery)

        f.close()

plt.figure()
plt.hist(forkLengths)
plt.xlabel('Fork Track Length (bp)')
plt.ylabel('Count')
plt.savefig('forkTrackLen.pdf')
plt.close()

The following example plots a histogram of fork stress scores from bed files generated by DNAscent forkSense.

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed']

forkStress = []

for fn in fnames:
     f = open(fn,'r')

        for line in f:

             #ignore the header lines
             if line[0] == '#':
                     continue

             #split the line into a list by whitespace
             splitLine = line.rstrip().split()

             score = float(splitLine[8])

             #ignore cases where a stress call was declined by only plotting non-negative scores
             if score >= 0.:
                     forkStress.append(score)

        f.close()

plt.figure()
plt.hist(forkStress)
plt.xlabel('Fork Stress Score')
plt.ylabel('Count')
plt.savefig('forkStress.pdf')
plt.close()

The following example pulls out stressed forks from the fork bed file.

stressThreshold = 0.9 #a sensible threshold for paused/stalled forks

fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed']

for fn in fnames:
     f = open(fn,'r')

     #make a new bed file to write on
     fn_split = fn.split('.')
     f_out = open(fn_split[0]+'_stressed.bed','w')

        for line in f:

             #ignore the header lines
             if line[0] == '#':
                     f_out.write(line)
                     continue

             #split the line into a list by whitespace
             splitLine = line.rstrip().split()

             score = float(splitLine[8])

             if score >= stressThreshold:
                     f_out.write(line)

        f.close()
        f_out.close()

The following takes the output of DNAscent seeBreaks and creates histograms of the expected and observed number of analogue tracks at read ends.

import matplotlib.pyplot as plt
import sys

expected = []
observed = []

# Usage: python plotSeeBreaks.py output.seeBreaks
f = open(sys.argv[1],'r')

parsing_expected = False
parsing_observed = False

for line in f:
        line = line.strip()
        if line.startswith(">ExpectedReadEndFractions:"):
                parsing_expected = True
                parsing_observed = False
                continue
        elif line.startswith(">ObservedReadEndFractions:"):
                parsing_expected = False
                parsing_observed = True
                continue
        elif line.startswith("#") or not line:
                continue

        value = float(line)
        if parsing_expected:
                expected.append(value)
        elif parsing_observed:
                observed.append(value)

plt.figure()
plt.hist(expected, bins=30, alpha=0.5, label="Expected", color="skyblue", edgecolor="black")
plt.hist(observed, bins=30, alpha=0.5, label="Observed", color="salmon", edgecolor="black")
plt.xlabel("Fraction of Analogue Tracks at Read Ends")
plt.ylabel("Count")
plt.legend(framealpha=0.3)
plt.savefig('seeBreaks.pdf')
plt.close()