From 5595f80cbd2159fec07c9cafd5ac3e2e42e9b10b Mon Sep 17 00:00:00 2001
From: aasthabharill <aasthabharill4@gmail.com>
Date: Tue, 9 Jun 2026 12:11:33 +0000
Subject: [PATCH 1/2] Add project context for datastream-to-spanner

---
 v2/datastream-to-spanner/architecture.dot   |  50 ++++
 v2/datastream-to-spanner/architecture.svg   | 243 ++++++++++++++++++++
 v2/datastream-to-spanner/project-context.md |  60 +++++
 3 files changed, 353 insertions(+)
 create mode 100644 v2/datastream-to-spanner/architecture.dot
 create mode 100644 v2/datastream-to-spanner/architecture.svg
 create mode 100644 v2/datastream-to-spanner/project-context.md
diff --git a/v2/datastream-to-spanner/architecture.dot b/v2/datastream-to-spanner/architecture.dot
new file mode 100644
index 0000000000..b7c316b976
--- /dev/null
+++ b/v2/datastream-to-spanner/architecture.dot
@@ -0,0 +1,50 @@
+digraph Architecture {  // Data-flow diagram of the DataStreamToSpanner Dataflow pipeline and the systems around it.
+  node [shape=box, fontname="Helvetica", style=filled, fillcolor=lightblue];  // Default node style; per-node attributes below override it.
+  // External systems and storage, declared outside the Dataflow cluster.
+  Datastream [label="Source DB\n(via Datastream)", shape=cylinder];
+  GCS [label="Google Cloud Storage\n(AVRO/JSON)", shape=folder];
+  Spanner [label="Cloud Spanner\n(Destination & Shadow Tables)", shape=cylinder];
+  DLQ [label="Dead Letter Queue\n(GCS)", shape=folder];
+  FilteredEventsGCS [label="Filtered Events\n(GCS)", shape=folder];
+
+  subgraph cluster_dataflow {  // The "cluster" name prefix makes Graphviz draw this subgraph as a labelled box.
+    label="Cloud Dataflow (DataStreamToSpanner)";
+    fontname="Helvetica-Bold";
+    style=dashed;
+    color=gray;
+    // Pipeline stages rendered inside the Dataflow box.
+    ReadDataStream [label="DataStreamIO\n(Read Events)"];
+    ProcessSchema [label="Process Information Schema\n(Read DDL)"];
+    ReadDLQ [label="FileBasedDeadLetterQueueReconsumer\n(Read from DLQ)"];
+    Reshuffle [label="Reshuffle\n(Random Key)"];
+    TransformEvents [label="Apply Transformation to events"];
+    WriteSpanner [label="Write events to Cloud Spanner"];
+    WriteFiltered [label="Write Filtered Events"];
+    WriteDLQRetry [label="Write To DLQ\n(Retryable Errors)"];
+    WriteDLQSevere [label="Write To DLQ2\n(Severe Errors)"];
+  }
+  // Edges into the pipeline. NOTE(review): the leading space in edge labels looks like deliberate padding - confirm before trimming.
+  Datastream -> GCS [label=" Change Events"];
+  GCS -> ReadDataStream;
+  Spanner -> ProcessSchema [label=" DDL"];
+  ProcessSchema -> TransformEvents [label=" SideInput: DDL View", style=dotted];  // Dotted style marks side-input edges.
+  ProcessSchema -> WriteSpanner [label=" SideInput: DDL View", style=dotted];
+  DLQ -> ReadDLQ [label=" Failed Events"];
+  // Both readers feed the reshuffle; the DLQ reader also forwards permanent errors to the severe-error writer.
+  ReadDataStream -> Reshuffle;
+  ReadDLQ -> Reshuffle [label=" Retryable Events"];
+  ReadDLQ -> WriteDLQSevere [label=" Permanent Errors"];
+  // Outputs of the transformation stage.
+  Reshuffle -> TransformEvents [label=" JSON Records"];
+  TransformEvents -> WriteFiltered [label=" Filtered Events"];
+  WriteFiltered -> FilteredEventsGCS;
+  TransformEvents -> WriteSpanner [label=" Transformed Events"];
+  TransformEvents -> WriteDLQSevere [label=" Permanent Errors"];
+  // Outcomes of the Spanner write stage.
+  WriteSpanner -> Spanner [label=" Spanner Mutations"];
+  WriteSpanner -> WriteDLQRetry [label=" Retryable Errors"];
+  WriteSpanner -> WriteDLQSevere [label=" Permanent Errors"];
+  // Both DLQ writers point at the same DLQ node, which the DLQ -> ReadDLQ edge above reads back from.
+  WriteDLQRetry -> DLQ;
+  WriteDLQSevere -> DLQ;
+}
diff --git a/v2/datastream-to-spanner/architecture.svg b/v2/datastream-to-spanner/architecture.svg
new file mode 100644
index 0000000000..9687a82161
--- /dev/null
+++ b/v2/datastream-to-spanner/architecture.svg
@@ -0,0 +1,243 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 14.1.2 (0)
+ -->
+<!-- Title: Architecture Pages: 1 -->
+<svg width="986pt" height="705pt"
+ viewBox="0.00 0.00 986.00 705.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 700.88)">
+<title>Architecture</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-700.88 982,-700.88 982,4 -4,4"/>
+<g id="clust1" class="cluster">
+<title>cluster_dataflow</title>
+<polygon fill="none" stroke="gray" stroke-dasharray="5,2" points="8,-75.91 8,-523.56 540,-523.56 540,-75.91 8,-75.91"/>
+<text xml:space="preserve" text-anchor="middle" x="274" y="-506.26" font-family="Helvetica,sans-Serif" font-weight="bold" font-size="14.00">Cloud Dataflow (DataStreamToSpanner)</text>
+</g>
+<!-- Datastream -->
+<g id="node1" class="node">
+<title>Datastream</title>
+<path fill="lightblue" stroke="black" d="M412.25,-691.94C412.25,-694.66 386.14,-696.88 354,-696.88 321.86,-696.88 295.75,-694.66 295.75,-691.94 295.75,-691.94 295.75,-647.5 295.75,-647.5 295.75,-644.78 321.86,-642.56 354,-642.56 386.14,-642.56 412.25,-644.78 412.25,-647.5 412.25,-647.5 412.25,-691.94 412.25,-691.94"/>
+<path fill="none" stroke="black" d="M412.25,-691.94C412.25,-689.21 386.14,-687 354,-687 321.86,-687 295.75,-689.21 295.75,-691.94"/>
+<text xml:space="preserve" text-anchor="middle" x="354" y="-672.17" font-family="Helvetica,sans-Serif" font-size="14.00">Source DB</text>
+<text xml:space="preserve" text-anchor="middle" x="354" y="-656.42" font-family="Helvetica,sans-Serif" font-size="14.00">(via Datastream)</text>
+</g>
+<!-- GCS -->
+<g id="node2" class="node">
+<title>GCS</title>
+<polygon fill="lightblue" stroke="black" points="429.12,-590.06 426.12,-594.06 405.12,-594.06 402.12,-590.06 278.88,-590.06 278.88,-550.56 429.12,-550.56 429.12,-590.06"/>
+<text xml:space="preserve" text-anchor="middle" x="354" y="-572.76" font-family="Helvetica,sans-Serif" font-size="14.00">Google Cloud Storage</text>
+<text xml:space="preserve" text-anchor="middle" x="354" y="-557.01" font-family="Helvetica,sans-Serif" font-size="14.00">(AVRO/JSON)</text>
+</g>
+<!-- Datastream&#45;&gt;GCS -->
+<g id="edge1" class="edge">
+<title>Datastream&#45;&gt;GCS</title>
+<path fill="none" stroke="black" d="M354,-642.1C354,-629.6 354,-614.62 354,-601.68"/>
+<polygon fill="black" stroke="black" points="357.5,-602 354,-592 350.5,-602 357.5,-602"/>
+<text xml:space="preserve" text-anchor="middle" x="396.75" y="-611.26" font-family="Times,serif" font-size="14.00"> Change Events</text>
+</g>
+<!-- ReadDataStream -->
+<g id="node6" class="node">
+<title>ReadDataStream</title>
+<polygon fill="lightblue" stroke="black" points="405.88,-491.81 302.12,-491.81 302.12,-452.31 405.88,-452.31 405.88,-491.81"/>
+<text xml:space="preserve" text-anchor="middle" x="354" y="-474.51" font-family="Helvetica,sans-Serif" font-size="14.00">DataStreamIO</text>
+<text xml:space="preserve" text-anchor="middle" x="354" y="-458.76" font-family="Helvetica,sans-Serif" font-size="14.00">(Read Events)</text>
+</g>
+<!-- GCS&#45;&gt;ReadDataStream -->
+<g id="edge2" class="edge">
+<title>GCS&#45;&gt;ReadDataStream</title>
+<path fill="none" stroke="black" d="M354,-550.3C354,-536.85 354,-518.55 354,-503.13"/>
+<polygon fill="black" stroke="black" points="357.5,-503.55 354,-493.55 350.5,-503.55 357.5,-503.55"/>
+</g>
+<!-- Spanner -->
+<g id="node3" class="node">
+<title>Spanner</title>
+<path fill="lightblue" stroke="black" d="M872.25,-125.88C872.25,-128.6 827.32,-130.81 772,-130.81 716.68,-130.81 671.75,-128.6 671.75,-125.88 671.75,-125.88 671.75,-81.44 671.75,-81.44 671.75,-78.71 716.68,-76.5 772,-76.5 827.32,-76.5 872.25,-78.71 872.25,-81.44 872.25,-81.44 872.25,-125.88 872.25,-125.88"/>
+<path fill="none" stroke="black" d="M872.25,-125.88C872.25,-123.15 827.32,-120.94 772,-120.94 716.68,-120.94 671.75,-123.15 671.75,-125.88"/>
+<text xml:space="preserve" text-anchor="middle" x="772" y="-106.11" font-family="Helvetica,sans-Serif" font-size="14.00">Cloud Spanner</text>
+<text xml:space="preserve" text-anchor="middle" x="772" y="-90.36" font-family="Helvetica,sans-Serif" font-size="14.00">(Destination &amp; Shadow Tables)</text>
+</g>
+<!-- ProcessSchema -->
+<g id="node7" class="node">
+<title>ProcessSchema</title>
+<polygon fill="lightblue" stroke="black" points="503.38,-399.81 312.62,-399.81 312.62,-360.31 503.38,-360.31 503.38,-399.81"/>
+<text xml:space="preserve" text-anchor="middle" x="408" y="-382.51" font-family="Helvetica,sans-Serif" font-size="14.00">Process Information Schema</text>
+<text xml:space="preserve" text-anchor="middle" x="408" y="-366.76" font-family="Helvetica,sans-Serif" font-size="14.00">(Read DDL)</text>
+</g>
+<!-- Spanner&#45;&gt;ProcessSchema -->
+<g id="edge3" class="edge">
+<title>Spanner&#45;&gt;ProcessSchema</title>
+<path fill="none" stroke="black" d="M747.67,-131.23C708.78,-172.63 628.96,-253.44 550,-307.81 523.33,-326.18 491.25,-342.69 464.34,-355.18"/>
+<polygon fill="black" stroke="black" points="462.94,-351.97 455.29,-359.3 465.85,-358.34 462.94,-351.97"/>
+<text xml:space="preserve" text-anchor="middle" x="652.87" y="-240.51" font-family="Times,serif" font-size="14.00"> DDL</text>
+</g>
+<!-- DLQ -->
+<g id="node4" class="node">
+<title>DLQ</title>
+<polygon fill="lightblue" stroke="black" points="337.75,-39.5 334.75,-43.5 313.75,-43.5 310.75,-39.5 206.25,-39.5 206.25,0 337.75,0 337.75,-39.5"/>
+<text xml:space="preserve" text-anchor="middle" x="272" y="-22.2" font-family="Helvetica,sans-Serif" font-size="14.00">Dead Letter Queue</text>
+<text xml:space="preserve" text-anchor="middle" x="272" y="-6.45" font-family="Helvetica,sans-Serif" font-size="14.00">(GCS)</text>
+</g>
+<!-- ReadDLQ -->
+<g id="node8" class="node">
+<title>ReadDLQ</title>
+<polygon fill="lightblue" stroke="black" points="283.62,-491.81 22.38,-491.81 22.38,-452.31 283.62,-452.31 283.62,-491.81"/>
+<text xml:space="preserve" text-anchor="middle" x="153" y="-474.51" font-family="Helvetica,sans-Serif" font-size="14.00">FileBasedDeadLetterQueueReconsumer</text>
+<text xml:space="preserve" text-anchor="middle" x="153" y="-458.76" font-family="Helvetica,sans-Serif" font-size="14.00">(Read from DLQ)</text>
+</g>
+<!-- DLQ&#45;&gt;ReadDLQ -->
+<g id="edge6" class="edge">
+<title>DLQ&#45;&gt;ReadDLQ</title>
+<path fill="none" stroke="black" d="M338.11,-19.89C476.2,-19.19 790,-23.59 881,-75.91 893.64,-83.17 900,-88.07 900,-102.66 900,-381.06 900,-381.06 900,-381.06 900,-446.25 402.72,-444.14 295.14,-452.39"/>
+<polygon fill="black" stroke="black" points="294.9,-448.89 285.28,-453.33 295.57,-455.86 294.9,-448.89"/>
+<text xml:space="preserve" text-anchor="middle" x="939" y="-240.51" font-family="Times,serif" font-size="14.00"> Failed Events</text>
+</g>
+<!-- FilteredEventsGCS -->
+<g id="node5" class="node">
+<title>FilteredEventsGCS</title>
+<polygon fill="lightblue" stroke="black" points="654,-123.41 651,-127.41 630,-127.41 627,-123.41 548,-123.41 548,-83.91 654,-83.91 654,-123.41"/>
+<text xml:space="preserve" text-anchor="middle" x="601" y="-106.11" font-family="Helvetica,sans-Serif" font-size="14.00">Filtered Events</text>
+<text xml:space="preserve" text-anchor="middle" x="601" y="-90.36" font-family="Helvetica,sans-Serif" font-size="14.00">(GCS)</text>
+</g>
+<!-- Reshuffle -->
+<g id="node9" class="node">
+<title>Reshuffle</title>
+<polygon fill="lightblue" stroke="black" points="222.88,-399.81 119.12,-399.81 119.12,-360.31 222.88,-360.31 222.88,-399.81"/>
+<text xml:space="preserve" text-anchor="middle" x="171" y="-382.51" font-family="Helvetica,sans-Serif" font-size="14.00">Reshuffle</text>
+<text xml:space="preserve" text-anchor="middle" x="171" y="-366.76" font-family="Helvetica,sans-Serif" font-size="14.00">(Random Key)</text>
+</g>
+<!-- ReadDataStream&#45;&gt;Reshuffle -->
+<g id="edge7" class="edge">
+<title>ReadDataStream&#45;&gt;Reshuffle</title>
+<path fill="none" stroke="black" d="M324.25,-451.85C307.02,-441.14 284.71,-427.95 264,-417.81 254.39,-413.11 244.03,-408.54 233.82,-404.32"/>
+<polygon fill="black" stroke="black" points="235.27,-401.13 224.68,-400.62 232.64,-407.61 235.27,-401.13"/>
+</g>
+<!-- TransformEvents -->
+<g id="node10" class="node">
+<title>TransformEvents</title>
+<polygon fill="lightblue" stroke="black" points="355.75,-307.81 152.25,-307.81 152.25,-271.81 355.75,-271.81 355.75,-307.81"/>
+<text xml:space="preserve" text-anchor="middle" x="254" y="-284.39" font-family="Helvetica,sans-Serif" font-size="14.00">Apply Transformation to events</text>
+</g>
+<!-- ProcessSchema&#45;&gt;TransformEvents -->
+<g id="edge4" class="edge">
+<title>ProcessSchema&#45;&gt;TransformEvents</title>
+<path fill="none" stroke="black" stroke-dasharray="1,5" d="M312.33,-361.71C300.38,-356.77 288.97,-350.44 279.25,-342.31 271.84,-336.11 266.42,-327.22 262.54,-318.55"/>
+<polygon fill="black" stroke="black" points="265.85,-317.4 258.98,-309.34 259.32,-319.93 265.85,-317.4"/>
+<text xml:space="preserve" text-anchor="middle" x="341.12" y="-329.01" font-family="Times,serif" font-size="14.00"> SideInput: DDL View</text>
+</g>
+<!-- WriteSpanner -->
+<g id="node11" class="node">
+<title>WriteSpanner</title>
+<polygon fill="lightblue" stroke="black" points="371.88,-219.31 172.12,-219.31 172.12,-183.31 371.88,-183.31 371.88,-219.31"/>
+<text xml:space="preserve" text-anchor="middle" x="272" y="-195.89" font-family="Helvetica,sans-Serif" font-size="14.00">Write events to Cloud Spanner</text>
+</g>
+<!-- ProcessSchema&#45;&gt;WriteSpanner -->
+<g id="edge5" class="edge">
+<title>ProcessSchema&#45;&gt;WriteSpanner</title>
+<path fill="none" stroke="black" stroke-dasharray="1,5" d="M413.73,-360.14C421.46,-330.48 431.22,-272.76 403,-237.31 398.82,-232.06 390,-227.41 378.81,-223.36"/>
+<polygon fill="black" stroke="black" points="379.97,-220.05 369.38,-220.27 377.79,-226.71 379.97,-220.05"/>
+<text xml:space="preserve" text-anchor="middle" x="483.73" y="-284.76" font-family="Times,serif" font-size="14.00"> SideInput: DDL View</text>
+</g>
+<!-- ReadDLQ&#45;&gt;Reshuffle -->
+<g id="edge8" class="edge">
+<title>ReadDLQ&#45;&gt;Reshuffle</title>
+<path fill="none" stroke="black" d="M156.82,-451.98C159.18,-440.16 162.27,-424.74 164.95,-411.29"/>
+<polygon fill="black" stroke="black" points="168.36,-412.1 166.89,-401.61 161.5,-410.73 168.36,-412.1"/>
+<text xml:space="preserve" text-anchor="middle" x="211.99" y="-421.01" font-family="Times,serif" font-size="14.00"> Retryable Events</text>
+</g>
+<!-- WriteDLQSevere -->
+<g id="node14" class="node">
+<title>WriteDLQSevere</title>
+<polygon fill="lightblue" stroke="black" points="162.12,-123.41 53.88,-123.41 53.88,-83.91 162.12,-83.91 162.12,-123.41"/>
+<text xml:space="preserve" text-anchor="middle" x="108" y="-106.11" font-family="Helvetica,sans-Serif" font-size="14.00">Write To DLQ2</text>
+<text xml:space="preserve" text-anchor="middle" x="108" y="-90.36" font-family="Helvetica,sans-Serif" font-size="14.00">(Severe Errors)</text>
+</g>
+<!-- ReadDLQ&#45;&gt;WriteDLQSevere -->
+<g id="edge9" class="edge">
+<title>ReadDLQ&#45;&gt;WriteDLQSevere</title>
+<path fill="none" stroke="black" d="M134.47,-451.99C108.15,-423.43 61.18,-366.61 43.25,-307.81 22.54,-239.89 23.79,-210.48 59,-148.81 62.54,-142.61 67.26,-136.81 72.38,-131.59"/>
+<polygon fill="black" stroke="black" points="74.61,-134.29 79.53,-124.91 69.83,-129.18 74.61,-134.29"/>
+<text xml:space="preserve" text-anchor="middle" x="93.12" y="-284.76" font-family="Times,serif" font-size="14.00"> Permanent Errors</text>
+</g>
+<!-- Reshuffle&#45;&gt;TransformEvents -->
+<g id="edge10" class="edge">
+<title>Reshuffle&#45;&gt;TransformEvents</title>
+<path fill="none" stroke="black" d="M166.57,-360.05C165.13,-349.15 165.33,-335.7 172,-325.81 174.85,-321.58 178.33,-317.86 182.22,-314.59"/>
+<polygon fill="black" stroke="black" points="183.97,-317.64 190.09,-308.99 179.91,-311.93 183.97,-317.64"/>
+<text xml:space="preserve" text-anchor="middle" x="214" y="-329.01" font-family="Times,serif" font-size="14.00"> JSON Records</text>
+</g>
+<!-- TransformEvents&#45;&gt;WriteSpanner -->
+<g id="edge13" class="edge">
+<title>TransformEvents&#45;&gt;WriteSpanner</title>
+<path fill="none" stroke="black" d="M199.71,-271.5C182.52,-262.99 170.23,-251.49 180,-237.31 182.92,-233.08 186.42,-229.36 190.32,-226.09"/>
+<polygon fill="black" stroke="black" points="192.08,-229.14 198.22,-220.5 188.04,-223.43 192.08,-229.14"/>
+<text xml:space="preserve" text-anchor="middle" x="237" y="-240.51" font-family="Times,serif" font-size="14.00"> Transformed Events</text>
+</g>
+<!-- WriteFiltered -->
+<g id="node12" class="node">
+<title>WriteFiltered</title>
+<polygon fill="lightblue" stroke="black" points="531.62,-219.31 390.38,-219.31 390.38,-183.31 531.62,-183.31 531.62,-219.31"/>
+<text xml:space="preserve" text-anchor="middle" x="461" y="-195.89" font-family="Helvetica,sans-Serif" font-size="14.00">Write Filtered Events</text>
+</g>
+<!-- TransformEvents&#45;&gt;WriteFiltered -->
+<g id="edge11" class="edge">
+<title>TransformEvents&#45;&gt;WriteFiltered</title>
+<path fill="none" stroke="black" d="M295.39,-271.52C328,-257.89 373.67,-238.81 408.79,-224.13"/>
+<polygon fill="black" stroke="black" points="410.05,-227.4 417.93,-220.31 407.35,-220.94 410.05,-227.4"/>
+<text xml:space="preserve" text-anchor="middle" x="419.17" y="-240.51" font-family="Times,serif" font-size="14.00"> Filtered Events</text>
+</g>
+<!-- TransformEvents&#45;&gt;WriteDLQSevere -->
+<g id="edge14" class="edge">
+<title>TransformEvents&#45;&gt;WriteDLQSevere</title>
+<path fill="none" stroke="black" d="M151.85,-275.45C118,-265.61 83.76,-248.62 63.25,-219.31 44.56,-192.61 64.02,-157.03 82.81,-132.63"/>
+<polygon fill="black" stroke="black" points="85.38,-135.02 88.95,-125.04 79.94,-130.62 85.38,-135.02"/>
+<text xml:space="preserve" text-anchor="middle" x="113.12" y="-196.26" font-family="Times,serif" font-size="14.00"> Permanent Errors</text>
+</g>
+<!-- WriteSpanner&#45;&gt;Spanner -->
+<g id="edge15" class="edge">
+<title>WriteSpanner&#45;&gt;Spanner</title>
+<path fill="none" stroke="black" d="M372.36,-184.17C448.83,-171.47 556.94,-152.68 660.42,-131.31"/>
+<polygon fill="black" stroke="black" points="660.82,-134.8 669.9,-129.34 659.4,-127.95 660.82,-134.8"/>
+<text xml:space="preserve" text-anchor="middle" x="620.44" y="-152.01" font-family="Times,serif" font-size="14.00"> Spanner Mutations</text>
+</g>
+<!-- WriteDLQRetry -->
+<g id="node13" class="node">
+<title>WriteDLQRetry</title>
+<polygon fill="lightblue" stroke="black" points="333.62,-123.41 210.38,-123.41 210.38,-83.91 333.62,-83.91 333.62,-123.41"/>
+<text xml:space="preserve" text-anchor="middle" x="272" y="-106.11" font-family="Helvetica,sans-Serif" font-size="14.00">Write To DLQ</text>
+<text xml:space="preserve" text-anchor="middle" x="272" y="-90.36" font-family="Helvetica,sans-Serif" font-size="14.00">(Retryable Errors)</text>
+</g>
+<!-- WriteSpanner&#45;&gt;WriteDLQRetry -->
+<g id="edge16" class="edge">
+<title>WriteSpanner&#45;&gt;WriteDLQRetry</title>
+<path fill="none" stroke="black" d="M272,-183.21C272,-169.85 272,-150.94 272,-135.02"/>
+<polygon fill="black" stroke="black" points="275.5,-135.09 272,-125.09 268.5,-135.09 275.5,-135.09"/>
+<text xml:space="preserve" text-anchor="middle" x="318.88" y="-152.01" font-family="Times,serif" font-size="14.00"> Retryable Errors</text>
+</g>
+<!-- WriteSpanner&#45;&gt;WriteDLQSevere -->
+<g id="edge17" class="edge">
+<title>WriteSpanner&#45;&gt;WriteDLQSevere</title>
+<path fill="none" stroke="black" d="M205.17,-182.87C192.42,-178.15 179.54,-172.35 168.25,-165.31 154.09,-156.5 140.76,-143.73 130.25,-132.19"/>
+<polygon fill="black" stroke="black" points="133.02,-130.04 123.79,-124.82 127.75,-134.65 133.02,-130.04"/>
+<text xml:space="preserve" text-anchor="middle" x="218.12" y="-152.01" font-family="Times,serif" font-size="14.00"> Permanent Errors</text>
+</g>
+<!-- WriteFiltered&#45;&gt;FilteredEventsGCS -->
+<g id="edge12" class="edge">
+<title>WriteFiltered&#45;&gt;FilteredEventsGCS</title>
+<path fill="none" stroke="black" d="M512.59,-182.94C523.76,-178.07 535.16,-172.19 545,-165.31 557.98,-156.24 570.18,-143.7 579.86,-132.38"/>
+<polygon fill="black" stroke="black" points="582.44,-134.76 586.09,-124.82 577.03,-130.31 582.44,-134.76"/>
+</g>
+<!-- WriteDLQRetry&#45;&gt;DLQ -->
+<g id="edge18" class="edge">
+<title>WriteDLQRetry&#45;&gt;DLQ</title>
+<path fill="none" stroke="black" d="M272,-83.66C272,-74.01 272,-62.06 272,-51.17"/>
+<polygon fill="black" stroke="black" points="275.5,-51.44 272,-41.44 268.5,-51.44 275.5,-51.44"/>
+</g>
+<!-- WriteDLQSevere&#45;&gt;DLQ -->
+<g id="edge19" class="edge">
+<title>WriteDLQSevere&#45;&gt;DLQ</title>
+<path fill="none" stroke="black" d="M146.45,-83.45C169.4,-71.99 198.75,-57.33 223.31,-45.07"/>
+<polygon fill="black" stroke="black" points="224.77,-48.25 232.15,-40.65 221.64,-41.99 224.77,-48.25"/>
+</g>
+</g>
+</svg>
diff --git a/v2/datastream-to-spanner/project-context.md b/v2/datastream-to-spanner/project-context.md
new file mode 100644
index 0000000000..83cb0e3df1
--- /dev/null
+++ b/v2/datastream-to-spanner/project-context.md
@@ -0,0 +1,60 @@
+# Project Context: Datastream to Spanner
+
+<!-- AI Agent: Please parse this document to understand the project's context before making changes. -->
+
+## Overview
+
+*   **Core Intent:** The Dataflow template is a streaming CDC (Change Data Capture) migration pipeline that moves real-time database changes from Datastream to Cloud Spanner. It's meant to reduce application downtime by applying in-flight changes while bulk data loads happen.
+*   **Primary Users:** Database administrators, migration engineers, and customers moving databases to Cloud Spanner.
+*   **Critical SLOs/Guarantees:** Must ensure eventual consistency with the source database by never allowing older events to overwrite newer ones (preserving commit order).
+*   **Terminology:**
+    *   **Change Event (CE):** A DML change (insert/update/delete) that contains the full row data.
+    *   **Shadow Table:** A companion table created alongside each Spanner destination table to keep track of versioning metadata (like an Oracle SCN) for each primary key.
+    *   **DLQ:** Dead Letter Queue (for failed records).
+    *   **Oracle SCN:** System Change Number. Oracle's version number for committed changes, used here to make sure events are applied in commit order.
+    *   **Schema Override:** A mechanism to override the schema of the destination database.
+
+## Technical Details
+
+*   **Tech Stack & Versions:**
+    *   **Languages:** Java 17
+    *   **Frameworks/Libraries:** Apache Beam, GCP Spanner SDK, Datastream API Client.
+    *   **Key Technologies:** Cloud Spanner, Cloud Dataflow, Datastream, Google Cloud Storage (GCS)
+*   **Code Location:** `v2/datastream-to-spanner`
+*   **Data Flow:** Source Database -> Datastream -> Google Cloud Storage (Avro/JSON) -> Cloud Dataflow Pipeline (parses, transforms, schema validates) -> Cloud Spanner. Failed events are sent to a Dead Letter Queue (DLQ) in GCS for recycling/retrying.
+*   **Project Structure (Logical Architecture Mapping):**
+    *   `v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates`: Core dataflow pipeline logic (`DataStreamToSpanner.java`) and DoFns.
+    *   `v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/datastream`: Datastream mapping, JSON/Avro parsing, and dialect-specific parsing contexts.
+    *   `v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/spanner`: Spanner-specific logic including `SpannerTransactionWriter` and Schema overrides parser.
+    *   `v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/transform`: Custom transformation and processing logic for change events.
+    *   `v2/datastream-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates`: Unit and Integration tests.
+*   **Build/Run Commands:**
+    See the `README.md` file for instructions on building and running the pipeline.
+
+## Documentation
+
+*   **Architecture Diagram & Dependency Tree:** [architecture.svg](architecture.svg) (Source: `architecture.dot`).
+    *   **Rule:** Always keep the `.dot` and `.svg` files in sync. If you modify the architecture, you MUST regenerate the `.svg` from the `.dot` file (e.g. `dot -Tsvg architecture.dot -o architecture.svg`).
+
+## AI Agent Tips
+
+*   **Common Tasks:** Adding new dialect support for Datastream, improving retry logic for the DLQ, adding transformations or metrics for the Dataflow pipeline.
+*   **Coding Standards & Best Practices:**
+    *   Individual CEs are processed separately for parallel scaling, rather than grouping them into the original source transactions. Consistency is managed using lateness checks on the Shadow Tables.
+    *   **Avoid Serial Processing:** Do not attempt to group events by transaction or serially order them. The approach relies on parallel workers, taking advantage of Cloud Dataflow's scale.
+    *   **Avoid GroupBy:** Do not use `GroupByKey` or internal worker state to filter stale events before writing. It doesn't scale well and complicates state recovery. Always use Shadow Tables for the lateness check.
+    *   Because CE writes are idempotent and protected by version checks, the template relies heavily on automatic retries for failed events, which reduces the complexity of handling referential integrity (e.g., when a child arrives before a parent).
+    *   **Referential Integrity:** For foreign keys and interleaved tables, rely purely on the retry mechanisms. A child event arriving before its parent will fail, but will eventually be written when it is retried after the parent succeeds.
+    *   Schema migration must be done *prior* to starting this pipeline; it does not process or replicate DDL events.
+*   **Testing Frameworks & Guidelines:**
+    *   **Frameworks:** JUnit 4, Mockito for mocking dependencies.
+    *   **Rules:** Ensure adequate unit test (UT) coverage for new logic. Integration tests should be placed in the respective `*IT.java` classes with robust wait conditions.
+*   **Areas to be Careful (Gotchas):**
+    *   Lateness checks on Shadow Tables are critical; bugs here can lead to data inconsistency.
+    *   DLQ retry logic (both `retryDLQ` and `retryAllDLQ` modes) handles data integrity on errors. Modifying it must be done carefully to prevent infinite loops or skipped events.
+    *   **Fatal Errors:** Unexpected/fatal errors (like type conversion failures) should not be endlessly retried. Ensure any new exceptions are properly routed to the severe DLQ bucket.
+    *   **Version Overflow:** Be mindful of edge cases in version ordering (e.g. if the Oracle SCN exceeds limits and restarts at zero). Ensure comparisons in `ChangeEventSequence` remain robust against edge case overflows.
+    *   **Data Size Limits:** Datastream enforces a 3MB size limit per Change Event. Ensure no individual rows exceed this.
+*   **Example PRs:**
+    *   [PR #3035](https://github.com/GoogleCloudPlatform/DataflowTemplates/pull/3035) - [datastream-to-spanner] Unable to convert field timestamp to long
+    *   [PR #2867](https://github.com/GoogleCloudPlatform/DataflowTemplates/pull/2867) - changed mysql event ordering in datastream to spanner

From a1147144732f8b3aa7310108f64f600b47475dab Mon Sep 17 00:00:00 2001
From: aasthabharill <aasthabharill4@gmail.com>
Date: Wed, 10 Jun 2026 04:43:11 +0000
Subject: [PATCH 2/2] Clarify Datastream/Dataflow boundary

---
 v2/datastream-to-spanner/project-context.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/v2/datastream-to-spanner/project-context.md b/v2/datastream-to-spanner/project-context.md
index 83cb0e3df1..776f1003b7 100644
--- a/v2/datastream-to-spanner/project-context.md
+++ b/v2/datastream-to-spanner/project-context.md
@@ -4,7 +4,7 @@
 
 ## Overview
 
-*   **Core Intent:** The Dataflow template is a streaming CDC (Change Data Capture) migration pipeline that moves real-time database changes from Datastream to Cloud Spanner. It's meant to reduce application downtime by applying in-flight changes while bulk data loads happen.
+*   **Core Intent:** The Dataflow template is a streaming CDC (Change Data Capture) migration pipeline that applies real-time database changes to Cloud Spanner. **Important distinction:** This template does *not* read directly from the source database or pipe data to Datastream. Instead, Datastream independently reads from the source DB and writes the change events as Avro (or JSON) files to a GCS bucket. This Dataflow template then consumes those files from GCS, converts the Avro records into internal JSON representations, and applies the changes to Spanner. It's meant to reduce application downtime by applying in-flight changes while bulk data loads happen.
 *   **Primary Users:** Database administrators, migration engineers, and customers moving databases to Cloud Spanner.
 *   **Critical SLOs/Guarantees:** Must ensure eventual consistency with the source database by never allowing older events to overwrite newer ones (preserving commit order).
 *   **Terminology:**
@@ -21,7 +21,7 @@
     *   **Frameworks/Libraries:** Apache Beam, GCP Spanner SDK, Datastream API Client.
     *   **Key Technologies:** Cloud Spanner, Cloud Dataflow, Datastream, Google Cloud Storage (GCS)
 *   **Code Location:** `v2/datastream-to-spanner`
-*   **Data Flow:** Source Database -> Datastream -> Google Cloud Storage (Avro/JSON) -> Cloud Dataflow Pipeline (parses, transforms, schema validates) -> Cloud Spanner. Failed events are sent to a Dead Letter Queue (DLQ) in GCS for recycling/retrying.
+*   **Data Flow:** Source Database -> Datastream -> Google Cloud Storage (Avro/JSON) -> Cloud Dataflow Pipeline (consumes Avro/JSON from GCS, converts Avro to JSON, transforms, validates against the schema) -> Cloud Spanner. Failed events are sent to a Dead Letter Queue (DLQ) in GCS for recycling/retrying.
 *   **Project Structure (Logical Architecture Mapping):**
     *   `v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates`: Core dataflow pipeline logic (`DataStreamToSpanner.java`) and DoFns.
     *   `v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/datastream`: Datastream mapping, JSON/Avro parsing, and dialect-specific parsing contexts.