Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## [Unreleased]

### Fixed

- Fixed incorrect labeling when multiple filters match the same token. Each occurrence is now labeled by the filter that matched it (e.g., `"My name is Austin, and I live in Austin TX."` now redacts to `"My name is [PERSON_1], and I live in [LOCATION_1] TX."` instead of labeling both occurrences with the last filter to match).

## [1.0.1] - 2026-04-16

### Fixed
Expand Down
9 changes: 2 additions & 7 deletions lib/top_secret/text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
require_relative "text/scan_result"
require_relative "text/global_mapping"
require_relative "text/label_sequence"
require_relative "text/substitution"

module TopSecret
# Processes text to identify and redact sensitive information using configured filters.
Expand Down Expand Up @@ -214,13 +215,7 @@ def build_mapping(values, label:)
#
# @return [void]
def substitute_text
  # Substitution is the only pass allowed to rewrite the text. It assigns labels
  # per occurrence and treats an empty mapping as a no-op, so no guard is needed
  # here. A plain value => label gsub must not run first: it keeps only the last
  # filter's label for a shared value, stamps every occurrence with it, and
  # leaves Substitution nothing to match.
  Substitution.new(mapping).apply!(output)
end

# Collects all filters to apply: default filters with overrides plus custom filters
Expand Down
66 changes: 66 additions & 0 deletions lib/top_secret/text/substitution.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# frozen_string_literal: true

module TopSecret
  class Text
    # Replaces matched values in text with their label placeholders.
    #
    # When a value is matched by a single filter, every occurrence is replaced
    # with that label. When a value is matched by multiple filters, occurrences
    # are labeled in filter order; if there are more labels than occurrences,
    # the later labels win (preserving "custom filter overrides default"
    # semantics), and if there are more occurrences than labels, the extra
    # occurrences reuse the last label.
    #
    # The text is rewritten in one left-to-right pass over the original input.
    # Placeholders that have been inserted are therefore never scanned again, so
    # a value that also happens to appear inside a placeholder (for example the
    # value "1" and the placeholder "[SERVER_1]") cannot corrupt it.
    class Substitution
      # @param mapping [Hash] label (without brackets) => matched value
      def initialize(mapping)
        @mapping = mapping
      end

      # Replaces every mapped value found in +output+ with its placeholder.
      #
      # @param output [String] text to rewrite; mutated in place
      # @return [String] the same +output+ object
      def apply!(output)
        return output if labels_by_value.empty?

        pattern = Regexp.union(values_by_precedence)
        queues = label_queues(output.scan(pattern))
        # Block form: the returned label is inserted literally (no backslash
        # interpretation) and the inserted text is not matched again.
        output.gsub!(pattern) { |value| queues.fetch(value).shift }
        output
      end

      private

      attr_reader :mapping

      # Groups placeholders by the value they stand for, in mapping order.
      #
      # @return [Hash{String => Array<String>}] value => bracketed labels
      def labels_by_value
        @labels_by_value ||= mapping.each_with_object({}) do |(filter, value), hash|
          (hash[value] ||= []) << "[#{filter}]"
        end
      end

      # Orders the values for the alternation pattern. Values claimed by one
      # filter come before values claimed by several, so that when both kinds
      # match at the same position the single-filter value is chosen.
      #
      # @return [Array<String>]
      def values_by_precedence
        single, multi = labels_by_value.keys.partition do |value|
          labels_by_value.fetch(value).one?
        end
        single + multi
      end

      # Builds, for every value that actually occurs, the labels to hand out
      # occurrence by occurrence.
      #
      # @param matches [Array<String>] matched values in order of appearance
      # @return [Hash{String => Array<String>}] value => labels to consume
      def label_queues(matches)
        counts = matches.each_with_object(Hash.new(0)) do |value, tally|
          tally[value] += 1
        end

        counts.each_with_object({}) do |(value, count), queues|
          queues[value] = labels_for_each_occurrence(labels_by_value.fetch(value), count)
        end
      end

      # Picks one label per occurrence: the last +occurrences+ labels, padded
      # with the final label when the value occurs more often than it was
      # matched.
      #
      # @param labels [Array<String>] labels in filter order
      # @param occurrences [Integer] how many times the value occurs
      # @return [Array<String>] exactly +occurrences+ labels
      def labels_for_each_occurrence(labels, occurrences)
        chosen = labels.last(occurrences)
        chosen + Array.new(occurrences - chosen.size, labels.last)
      end
    end
  end
end
72 changes: 72 additions & 0 deletions spec/top_secret/text_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,78 @@
end
end

context "when the same value matches multiple filters" do
let(:austin_person) { build_entity(text: "Austin", tag: :person) }
let(:austin_location) { build_entity(text: "Austin", tag: :location) }

before do
stub_ner_entities(austin_person, austin_location)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should check to see what happens if we swap the order:

Suggested change
stub_ner_entities(austin_person, austin_location)
stub_ner_entities(austin_location, austin_person)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does _ner_ mean in the method name?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Named entity recognition. I learned about it from using MITIE Ruby.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it! You've probably explained it before, which makes me think, could this method use the full name instead of the acronym so it's more accessible to the reader?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case I wanted to be consistent with how MITIE Ruby names things, since it has a Mitie::NER class.

end

it "labels each occurrence according to the filter that matched it" do
  # "Austin" occurs twice: the first occurrence should take the person label,
  # the second the location label, and both labels stay in the mapping.
  expected_output = "My name is [PERSON_1], and I live in [LOCATION_1] TX."
  expected_mapping = {PERSON_1: "Austin", LOCATION_1: "Austin"}

  result = TopSecret::Text.filter("My name is Austin, and I live in Austin TX.")

  expect(result.output).to eq(expected_output)
  expect(result.mapping).to eq(expected_mapping)
end
Comment thread
stevepolitodesign marked this conversation as resolved.
end

context "when the same value appears more times than the filters that matched it" do
  # Two stubbed NER entities tag "Austin" (person first, then location), while
  # the input below contains the word three times.
  let(:austin_person) { build_entity(text: "Austin", tag: :person) }
  let(:austin_location) { build_entity(text: "Austin", tag: :location) }

  before { stub_ner_entities(austin_person, austin_location) }

  it "redacts every occurrence, falling back to the last matching filter's label" do
    result = TopSecret::Text.filter("Austin met Austin near Austin.")

    expect(result.output).to eq("[PERSON_1] met [LOCATION_1] near [LOCATION_1].")
  end
end

context "when the same value matches multiple non-NER filters" do
before do
stub_ner_entities
end

it "labels each occurrence according to the filter that matched it" do
# Both filters deliberately use the very same pattern so that one value is
# matched by two differently labeled filters.
ipv4_pattern = /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
ip_filter = TopSecret::Filters::Regex.new(label: "IP_ADDRESS", regex: ipv4_pattern)
server_filter = TopSecret::Filters::Regex.new(label: "SERVER", regex: ipv4_pattern)
Comment on lines +113 to +120
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make it clearer that these are identical expressions, what do you think of this?

Suggested change
ip_filter = TopSecret::Filters::Regex.new(
label: "IP_ADDRESS",
regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
)
server_filter = TopSecret::Filters::Regex.new(
label: "SERVER",
regex: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
)
regex = /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
ip_filter = TopSecret::Filters::Regex.new(
label: "IP_ADDRESS",
regex:
)
server_filter = TopSecret::Filters::Regex.new(
label: "SERVER",
regex:
)


result = TopSecret::Text.filter(
"Primary 192.168.1.1, backup 192.168.1.1.",
custom_filters: [ip_filter, server_filter]
)
Comment on lines +122 to +125
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here.

Suggested change
result = TopSecret::Text.filter(
"Primary 192.168.1.1, backup 192.168.1.1.",
custom_filters: [ip_filter, server_filter]
)
result = TopSecret::Text.filter(
"Primary 192.168.1.1, backup 192.168.1.1.",
custom_filters: [server_filter, ip_filter]
)


expect(result.output).to eq(
"Primary [IP_ADDRESS_1], backup [SERVER_1]."
)
expect(result.mapping).to eq({
IP_ADDRESS_1: "192.168.1.1",
SERVER_1: "192.168.1.1"
})
end
end

context "when a custom filter matches the same value as a default filter" do
it "uses the custom filter's label" do
custom_filter = TopSecret::Filters::Regex.new(
Expand Down