Skip to content

Commit d2adfa6

Browse files
committed
adding sample name and other checks
1 parent 0696f93 commit d2adfa6

File tree

1 file changed

+15
-10
lines changed

1 file changed

+15
-10
lines changed

preprocessing/preprocess.new.py

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,8 @@
22

33

44
__author__ = 'Francesco Asnicar (f.asnicar@unitn.it)'
5-
__version__ = '0.2.3'
6-
__date__ = '11 November 2019'
5+
__version__ = '0.2.4'
6+
__date__ = '30 December 2019'
77

88

99
import os
@@ -51,6 +51,8 @@ def read_params():
5151
p.add_argument('-i', '--input_dir', required=True, type=str, help="Path to input directory")
5252
p.add_argument('-e', '--extension', required=False, default=".fastq.gz",
5353
choices=[".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2"], help="The extension of the raw input files")
54+
p.add_argument('-s', '--samplename', required=False, default="", help="Specify the sample name")
55+
5456
p.add_argument('-f', '--forward', required=False, default="R1",
5557
help="Identifier to distinguish forward reads in the input folder")
5658
p.add_argument('-r', '--reverse', required=False, default="R2",
@@ -114,12 +116,12 @@ def preflight_check(dry_run=False, verbose=False):
114116
error('preflight_check()\n{}\n{}'.format(cmd, e), exit=True)
115117

116118

117-
def get_inputs(input_dir, fwd, rev, ext, verbose=False):
119+
def get_inputs(input_dir, fwd, rev, sn, ext, verbose=False):
118120
if verbose:
119121
info('get_inputs()\n', init_new_line=True)
120122

121-
R1 = sorted(glob.glob(os.path.join(input_dir, '*{}*{}'.format(fwd, ext))))
122-
R2 = sorted(glob.glob(os.path.join(input_dir, '*{}*{}'.format(rev, ext))))
123+
R1 = sorted([os.path.join(input_dir, i) for i in os.listdir(input_dir) if (fwd in i.replace(sn, '')) and i.endswith(ext)])
124+
R2 = sorted([os.path.join(input_dir, i) for i in os.listdir(input_dir) if (rev in i.replace(sn, '')) and i.endswith(ext)])
123125

124126
return (R1, R2)
125127

@@ -197,7 +199,7 @@ def concatenate_reads_mp(x):
197199
terminating.set()
198200

199201

200-
def quality_control(input_dir, merged_r1_r2, keep_intermediate, nproc=1, dry_run=False, verbose=False):
202+
def quality_control(input_dir, merged_r1_r2, keep_intermediate, sn, nproc=1, dry_run=False, verbose=False):
201203
if dry_run or verbose:
202204
info('quality_control()\n', init_new_line=True)
203205

@@ -213,8 +215,8 @@ def quality_control(input_dir, merged_r1_r2, keep_intermediate, nproc=1, dry_run
213215
except Exception as e:
214216
error('quality_control()\ntasks: {}\n e: {}'.format(tasks, e), init_new_line=True, exit=True)
215217

216-
r1 = [i for i in qc if "R1" in i]
217-
r2 = [i for i in qc if "R2" in i]
218+
r1 = [i for i in qc if "R1" in i.replace(sn, '')]
219+
r2 = [i for i in qc if "R2" in i.replace(sn, '')]
218220

219221
if len(r1) > 1:
220222
error('quality_control(): more than one R1 detected: [{}]'.format(', '.join(r1)), exit=True)
@@ -478,7 +480,10 @@ def remove(to_remove, keep_intermediate, folder=None, dry_run=False, verbose=Fal
478480

479481
check_params(args)
480482
preflight_check(dry_run=args.dry_run, verbose=args.verbose)
481-
inputs_r1s_r2s = get_inputs(args.input_dir, args.forward, args.reverse, args.extension, verbose=args.verbose)
483+
inputs_r1s_r2s = get_inputs(args.input_dir, args.forward, args.reverse, args.samplename, args.extension, verbose=args.verbose)
484+
485+
if (len(inputs_r1s_r2s[0]) == 0) or (len(inputs_r1s_r2s[1]) == 0):
486+
error('No input files detected!\nR1s: {}\nR2s: {}'.format(inputs_r1s_r2s[0], inputs_r1s_r2s[1]), exit=True)
482487

483488
if args.dry_run or args.verbose:
484489
info('inputs_r1s: {}\n'.format('\n '.join(inputs_r1s_r2s[0])), init_new_line=True)
@@ -490,7 +495,7 @@ def remove(to_remove, keep_intermediate, folder=None, dry_run=False, verbose=Fal
490495
info('merged_r1: {}\n'.format(merged_r1_r2[0]), init_new_line=True)
491496
info('merged_r2: {}\n'.format(merged_r1_r2[1]))
492497

493-
qced_r1_r2 = quality_control(args.input_dir, merged_r1_r2, args.keep_intermediate,
498+
qced_r1_r2 = quality_control(args.input_dir, merged_r1_r2, args.keep_intermediate, args.samplename,
494499
nproc=args.nproc, dry_run=args.dry_run, verbose=args.verbose)
495500
remove(merged_r1_r2, args.keep_intermediate, folder=args.input_dir, dry_run=args.dry_run, verbose=args.verbose)
496501

0 commit comments

Comments
 (0)