Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Join paired ends updates #1726

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
72 changes: 68 additions & 4 deletions qiime/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
__author__ = "Rob Knight"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Rob Knight", "Justin Kuczynski", "Jeremy Widmann",
"Antonio Gonzalez Pena", "Daniel McDonald", "Jai Ram Rideout"]
"Antonio Gonzalez Pena", "Daniel McDonald", "Jai Ram Rideout",
"Mike Robeson"]
# remember to add yourself if you make changes
__license__ = "GPL"
__version__ = "1.8.0-dev"
Expand All @@ -14,13 +15,14 @@
from numpy import asarray, isnan, log10, median
from StringIO import StringIO
from re import compile, sub
from os import walk
from os import walk, path
from os.path import join, splitext, exists, isfile, abspath

from skbio.io import read
from skbio.sequence import BiologicalSequence
from biom.table import Table

from qiime.util import get_qiime_library_version, load_qiime_config
from qiime.util import get_qiime_library_version, load_qiime_config, qiime_open
from qiime.colors import data_color_hsv

"""Contains formatters for the files we expect to encounter in 454 workflow.
Expand Down Expand Up @@ -601,7 +603,7 @@ def write_Fasta_from_name_seq_pairs(name_seqs, fh):
raise ValueError("Need open file handle to write to.")

for (name, seq) in name_seqs:
fh.write("%s\n" % BiologicalSequence(seq, id=name).to_fasta())
BiologicalSequence(seq, id=name).write(fh, format='fasta')


def illumina_data_to_fastq(record_data, number_of_bases=None):
Expand Down Expand Up @@ -895,6 +897,68 @@ def format_fastq_record(label,
return "@%s\n%s\n+\n%s\n" % (label, seq, qual)


def write_synced_barcodes_fastq(joined_fp, index_fp, qual_score_variant='illumina1.8'):
"""Writes new index file based on surviving assembled paired-ends.
-joined_fp : file path to paired-end assembled fastq file
-index_fp : file path to index / barcode reads fastq file
-qual_score_variant : format of fastq quality scores. Can be
\'illumina1.3\' or \'illumina1.8\'

This function iterates through the joined reads file and index file.
Only those index-reads within the file at index_fp, that have headers
matching those within the joined-pairs at joined_fp, are written
to file.

Always forces output to be: illumina1.8

WARNING: Assumes reads are in the same order in both files,
except for cases in which the corresponding
read in the joined_fp file is missing (i.e. pairs
failed to assemble).

"""

# open files (handles normal / gzipped data)
jh = qiime_open(joined_fp)
ih = qiime_open(index_fp)

# base new index file name on joined paired-end file name:
j_path, ext = path.splitext(joined_fp)
filtered_bc_outfile_path = j_path + '_barcodes.fastq'
fbc_fh = open(filtered_bc_outfile_path, 'w')

# Set up iterators
index_fastq_iter = read(ih, format='fastq', variant=qual_score_variant)
joined_fastq_iter = read(jh, format='fastq', variant=qual_score_variant)

# Write barcodes / index reads that we observed within
# the joined paired-ends. Warn if index and joined data
# are not in order.
for joined in joined_fastq_iter:
index = index_fastq_iter.next()
while joined.id != index.id:
try:
index = index_fastq_iter.next()
except StopIteration:
raise StopIteration("\n\nReached end of index-reads file" +
" before iterating through joined paired-end-reads file!" +
" Except for missing paired-end reads that did not survive" +
" assembly, your index and paired-end reads files must be in" +
" the same order! Also, check that the index-reads and" +
" paired-end reads have identical headers. The last joined" +
" paired-end ID processed was:\n\'%s\'\n" % (joined.id))
else:
# force output to illumina1.8
index.write(fbc_fh, format='fastq', variant='illumina1.8')

ih.close()
jh.close()
fbc_fh.close()

return filtered_bc_outfile_path



HTML_LINES_INIT = """<html>
<head>

Expand Down
56 changes: 0 additions & 56 deletions qiime/join_paired_ends.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,8 @@
__maintainer__ = "Mike Robeson"
__email__ = "robesonms@ornl.gov"

from skbio.parse.sequences import parse_fastq
from skbio.format.sequences import format_fastq_record
from bfillings.fastq_join import FastqJoin, join_paired_end_reads_fastqjoin
from bfillings.seqprep import SeqPrep, join_paired_end_reads_seqprep
from qiime.util import qiime_open
import os
import gzip

Expand All @@ -21,57 +18,4 @@
'SeqPrep': join_paired_end_reads_seqprep}


def write_synced_barcodes_fastq(joined_fp, index_fp):
"""Writes new index file based on surviving assembled paired-ends.
-joined_fp : file path to paired-end assembled fastq file
-index_fp : file path to index / barcode reads fastq file

This function iterates through the joined reads file and index file.
Only those index-reads within the file at index_fp, that have headers
matching those within the joined-pairs at joined_fp, are written
to file.

WARNING: Assumes reads are in the same order in both files,
except for cases in which the corresponding
read in the joined_fp file is missing (i.e. pairs
failed to assemble).

"""

# open files (handles normal / gzipped data)
jh = qiime_open(joined_fp)
ih = qiime_open(index_fp)

# base new index file name on joined paired-end file name:
j_path, ext = os.path.splitext(joined_fp)
filtered_bc_outfile_path = j_path + '_barcodes.fastq'
fbc_fh = open(filtered_bc_outfile_path, 'w')

# Set up iterators
index_fastq_iter = parse_fastq(ih, strict=False)
joined_fastq_iter = parse_fastq(jh, strict=False)
# Write barcodes / index reads that we observed within
# the joined paired-ends. Warn if index and joined data
# are not in order.
for joined_label, joined_seq, joined_qual in joined_fastq_iter:
index_label, index_seq, index_qual = index_fastq_iter.next()
while joined_label != index_label:
try:
index_label, index_seq, index_qual = index_fastq_iter.next()
except StopIteration:
raise StopIteration("\n\nReached end of index-reads file" +
" before iterating through joined paired-end-reads file!" +
" Except for missing paired-end reads that did not survive" +
" assembly, your index and paired-end reads files must be in" +
" the same order! Also, check that the index-reads and" +
" paired-end reads have identical headers. The last joined" +
" paired-end ID processed was:\n\'%s\'\n" % (joined_label))
else:
fastq_string = format_fastq_record(index_label, index_seq, index_qual)
fbc_fh.write(fastq_string)

ih.close()
jh.close()
fbc_fh.close()

return filtered_bc_outfile_path
16 changes: 12 additions & 4 deletions scripts/join_paired_ends.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
__email__ = "robesonms@ornl.gov"

from qiime.join_paired_ends import (join_method_names,
join_method_constructors,
write_synced_barcodes_fastq)
join_method_constructors)
from qiime.format import write_synced_barcodes_fastq
from qiime.util import (parse_command_line_parameters, get_options_lookup,
make_option, load_qiime_config, create_dir)
import os
Expand Down Expand Up @@ -114,7 +114,14 @@
help='Only applies to SeqPrep method, otherwise ignored.' +
' Set if input reads are in phred+64 format. Output will '
'always be phred+33. [default: %default]',
default=False)]
default=False),
make_option('-q', '--qual_score_variant',
help=' Input format of fastq data. Can be: \'illumina1.3\' or'+
' \'illumina1.8\'. Only used if using the \'-b\' option.' +
' Output will always be \'illumina1.8\'.' +
'[default: %default]. For more info see: '+
'http://scikit-bio.org/docs/latest/generated/skbio.io.fastq.html',
default='illumina1.8')]

script_info['version'] = __version__

Expand All @@ -127,6 +134,7 @@ def main():
forward_reads_fp = opts.forward_reads_fp
reverse_reads_fp = opts.reverse_reads_fp
pe_join_method = opts.pe_join_method
qual_score_variant=opts.qual_score_variant
output_dir = opts.output_dir
# fastq-join only options:
perc_max_diff = opts.perc_max_diff
Expand Down Expand Up @@ -168,7 +176,7 @@ def main():
if opts.index_reads_fp:
index_reads = opts.index_reads_fp
assembly_fp = paths['Assembled'] # grab joined-pairs output path
write_synced_barcodes_fastq(assembly_fp, index_reads)
write_synced_barcodes_fastq(assembly_fp, index_reads, qual_score_variant=qual_score_variant)


if __name__ == "__main__":
Expand Down
Loading