snakemake-workflows · vreuter · Mar 13, 2019 · Mar 13, 2019 · Mar 13, 2019 · Mar 13, 2019
diff --git a/prjcfg.yaml b/prjcfg.yaml
@@ -0,0 +1,6 @@
+metadata:
+  sample_annotation: samples_peppy.tsv  # sample sheet added alongside this file; has a sample_name column
+  sample_subannotation: units_peppy.tsv  # per-unit rows (sample_name, unit, platform, fq1, fq2)
+
+snake_config: "config.yaml"  # read in rules/common.smk as p.snake_config and handed to snakemake's configfile
+
diff --git a/rules/common.smk b/rules/common.smk
@@ -1,17 +1,45 @@
 import pandas as pd
+from peppy import Project, SAMPLE_NAME_COLNAME as PEP_SAMPLE_COL
 from snakemake.utils import validate
 
-report: "../report/workflow.rst"
+
+def peppy_rename(df):
+    """ Rename peppy's column for sample name identification to snakemake's. """
+    if df is None:
+        return None
+    if "sample" in df.columns and PEP_SAMPLE_COL in df.columns:
+        raise Exception("Multiple sample identifier columns present: {}".format(", ".join(["sample", PEP_SAMPLE_COL])))
+    return df.rename({PEP_SAMPLE_COL: "sample"}, axis=1)
+
+
+def peppy_units(df):
+    """ Add unit/subsample indices to peppy a data frame.  """
+    if "unit" in df.columns:
+        return df
+    def count_names(names):
+        def go(rem, n, curr, acc):
+            if rem == []:
+                return acc + [n]
+            h, t = rem[0], rem[1:]
+            return go(t, n + 1, curr, acc) if h == curr else go(t, 1, h, acc + [n])
+        return go(names[1:], 1, names[0], []) if names else []
+    df.insert(1, "unit", [i for n in count_names(list(df[PEP_SAMPLE_COL])) for i in range(1, n + 1)])
+    return df
+
 
 ###### Config file and sample sheets #####
-configfile: "config.yaml"
+p = Project("prjcfg.yaml")  # peppy project; prjcfg.yaml's snake_config entry names the snakemake config file loaded below
+configfile: p.snake_config
 validate(config, schema="../schemas/config.schema.yaml")
 
-samples = pd.read_table(config["samples"]).set_index("sample", drop=False)
+samples = p.sheet  # presumably the table from metadata.sample_annotation in prjcfg.yaml - confirm against peppy
+samples = peppy_rename(samples).set_index("sample", drop=False)  # index by "sample" while keeping it as a regular column
+
 validate(samples, schema="../schemas/samples.schema.yaml")
 
-units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False)
+units = peppy_units(peppy_rename(p.sample_subannotation)).set_index(["sample", "unit"], drop=False)  # NOTE(review): peppy_rename passes None through and peppy_units reads .columns right away - confirm p.sample_subannotation is never None here
 units.index = units.index.set_levels([i.astype(str) for i in units.index.levels])  # enforce str in index
+
 validate(units, schema="../schemas/units.schema.yaml")
 
 # contigs in reference genome

diff --git a/samples_peppy.tsv b/samples_peppy.tsv
@@ -0,0 +1,4 @@
+sample_name
+A
+B
+
diff --git a/units_peppy.tsv b/units_peppy.tsv
@@ -0,0 +1,4 @@
+sample_name	unit	platform	fq1	fq2
+A	1	ILLUMINA	data/reads/a.chr21.1.fq	data/reads/a.chr21.2.fq
+B	1	ILLUMINA	data/reads/b.chr21.1.fq	data/reads/b.chr21.2.fq
+B	2	ILLUMINA	data/reads/b.chr21.1.fq