diff --git a/prjcfg.yaml b/prjcfg.yaml new file mode 100644 index 0000000..cee7a6b --- /dev/null +++ b/prjcfg.yaml @@ -0,0 +1,6 @@ +metadata: + sample_annotation: samples_peppy.tsv + sample_subannotation: units_peppy.tsv + +snake_config: "config.yaml" + diff --git a/rules/common.smk b/rules/common.smk index f6b9635..35414ab 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,17 +1,45 @@ import pandas as pd +from peppy import Project, SAMPLE_NAME_COLNAME as PEP_SAMPLE_COL from snakemake.utils import validate -report: "../report/workflow.rst" + +def peppy_rename(df): + """ Rename peppy's sample name column (PEP_SAMPLE_COL) to 'sample', the identifier column this workflow indexes on; returns None for a None input and raises if both columns are already present. """ + if df is None: + return None + if "sample" in df.columns and PEP_SAMPLE_COL in df.columns: + raise Exception("Multiple sample identifier columns present: {}".format(", ".join(["sample", PEP_SAMPLE_COL]))) + return df.rename({PEP_SAMPLE_COL: "sample"}, axis=1) + + +def peppy_units(df): + """ Add unit/subsample indices to a peppy data frame: if there is no 'unit' column, insert one at position 1 that numbers each run of consecutive rows sharing a sample name (read from the PEP_SAMPLE_COL column) from 1; the frame is modified in place and returned. 
""" + if "unit" in df.columns: + return df + def count_names(names): + def go(rem, n, curr, acc): + if rem == []: + return acc + [n] + h, t = rem[0], rem[1:] + return go(t, n + 1, curr, acc) if h == curr else go(t, 1, h, acc + [n]) + return go(names[1:], 1, names[0], []) if names else [] + df.insert(1, "unit", [i for n in count_names(list(df[PEP_SAMPLE_COL])) for i in range(1, n + 1)]) + return df + ###### Config file and sample sheets ##### -configfile: "config.yaml" +p = Project("prjcfg.yaml") +configfile: p.snake_config validate(config, schema="../schemas/config.schema.yaml") -samples = pd.read_table(config["samples"]).set_index("sample", drop=False) +samples = p.sheet +samples = peppy_rename(samples).set_index("sample", drop=False) + validate(samples, schema="../schemas/samples.schema.yaml") -units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False) +units = peppy_units(peppy_rename(p.sample_subannotation)).set_index(["sample", "unit"], drop=False) units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index + validate(units, schema="../schemas/units.schema.yaml") # contigs in reference genome diff --git a/samples_peppy.tsv b/samples_peppy.tsv new file mode 100644 index 0000000..cc9db9b --- /dev/null +++ b/samples_peppy.tsv @@ -0,0 +1,4 @@ +sample_name +A +B + diff --git a/units_peppy.tsv b/units_peppy.tsv new file mode 100644 index 0000000..98f46b9 --- /dev/null +++ b/units_peppy.tsv @@ -0,0 +1,4 @@ +sample_name unit platform fq1 fq2 +A 1 ILLUMINA data/reads/a.chr21.1.fq data/reads/a.chr21.2.fq +B 1 ILLUMINA data/reads/b.chr21.1.fq data/reads/b.chr21.2.fq +B 2 ILLUMINA data/reads/b.chr21.1.fq