# frozen_string_literal: true

module TopSecret
  class Text
    # Replaces matched values in text with their label placeholders.
    #
    # When a value is matched by a single filter, every occurrence is replaced
    # with that label. When a value is matched by multiple filters, occurrences
    # are labeled in filter order; if there are more labels than occurrences,
    # the later labels win (preserving "custom filter overrides default"
    # semantics). If there are more occurrences than labels, the remaining
    # occurrences reuse the last label.
    #
    # All replacements happen in one left-to-right pass over the original text,
    # so a placeholder that has just been inserted (e.g. "[ROOM_1]") is never
    # re-scanned and cannot be corrupted by a value such as "1".
    class Substitution
      # @param mapping [Hash] label (e.g. +:PERSON_1+) => matched value
      def initialize(mapping)
        @mapping = mapping
      end

      # Redacts every mapped value found in +output+, mutating it in place.
      #
      # @param output [String] the text to redact
      # @return [String] the same +output+ object
      def apply!(output)
        return output if labels_by_value.empty?

        pattern = Regexp.union(values_longest_first)
        # Occurrences are counted with the exact pattern used for replacement,
        # so the label queues line up one-to-one with what gsub! will match.
        pending = pending_labels(count_matches(output, pattern))
        # gsub! returns nil when nothing matched, hence the explicit return below.
        output.gsub!(pattern) { |value| pending.fetch(value).shift }
        output
      end

      private

      attr_reader :mapping

      # Groups placeholders by the value they stand for, in mapping order.
      # Nil and empty values are skipped: an empty pattern would match at
      # every position in the text.
      def labels_by_value
        @labels_by_value ||= mapping.each_with_object({}) do |(filter, value), hash|
          next if value.nil? || value.empty?

          (hash[value] ||= []) << "[#{filter}]"
        end
      end

      # Longer values go first so that, at any given position, "Austin TX"
      # is preferred over its prefix "Austin" (regex alternation is ordered).
      def values_longest_first
        labels_by_value.keys.sort_by { |value| -value.length }
      end

      # Counts how many times each value is matched by +pattern+ in +output+.
      def count_matches(output, pattern)
        output.scan(pattern).each_with_object(Hash.new(0)) do |value, counts|
          counts[value] += 1
        end
      end

      # Builds, for each matched value, the queue of labels to hand out to its
      # successive occurrences.
      def pending_labels(occurrence_counts)
        occurrence_counts.each_with_object({}) do |(value, occurrences), queues|
          queues[value] = labels_for_each_occurrence(labels_by_value.fetch(value), occurrences)
        end
      end

      # Takes the last +occurrences+ labels (later filters win when labels
      # outnumber occurrences) and pads with the final label when occurrences
      # outnumber labels.
      def labels_for_each_occurrence(labels, occurrences)
        chosen = labels.last(occurrences)
        chosen + Array.new(occurrences - chosen.size, labels.last)
      end
    end
  end
end
+ ) + end + end + + context "when the same value matches multiple non-NER filters" do + before do + stub_ner_entities + end + + it "labels each occurrence according to the filter that matched it" do + ip_filter = TopSecret::Filters::Regex.new( + label: "IP_ADDRESS", + regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/ + ) + server_filter = TopSecret::Filters::Regex.new( + label: "SERVER", + regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/ + ) + + result = TopSecret::Text.filter( + "Primary 192.168.1.1, backup 192.168.1.1.", + custom_filters: [ip_filter, server_filter] + ) + + expect(result.output).to eq( + "Primary [IP_ADDRESS_1], backup [SERVER_1]." + ) + expect(result.mapping).to eq({ + IP_ADDRESS_1: "192.168.1.1", + SERVER_1: "192.168.1.1" + }) + end + end + context "when a custom filter matches the same value as a default filter" do it "uses the custom filter's label" do custom_filter = TopSecret::Filters::Regex.new(