From 841f5836b798cb830f38b90662d11d07feb88368 Mon Sep 17 00:00:00 2001 From: voetberg Date: Wed, 18 Mar 2026 09:44:00 -0500 Subject: [PATCH 1/6] Common: Update hashbang to python3 Issue: probes#127 --- common/check_stuck_rules | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/check_stuck_rules b/common/check_stuck_rules index 84633ba6..b47492bc 100755 --- a/common/check_stuck_rules +++ b/common/check_stuck_rules @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright European Organization for Nuclear Research (CERN) 2013 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,6 @@ """ Probe to check the backlog of stuck rules. """ -from __future__ import print_function import sys import traceback From eb8ff1dccb37b2060e9f84e3ae7d10fe67a5e834 Mon Sep 17 00:00:00 2001 From: voetberg Date: Wed, 18 Mar 2026 09:49:20 -0500 Subject: [PATCH 2/6] Common: Update to SQLA2.0 Issue: probes#127 --- common/check_stuck_rules | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/common/check_stuck_rules b/common/check_stuck_rules index b47492bc..c2c89ea7 100755 --- a/common/check_stuck_rules +++ b/common/check_stuck_rules @@ -18,18 +18,17 @@ import sys import traceback from prometheus_client import CollectorRegistry, Gauge, push_to_gateway +from sqlalchemy.sql import and_, func, null, or_, select + from rucio.common.config import config_get -from rucio.db.sqla.session import BASE, get_session +from rucio.db.sqla import models +from rucio.db.sqla.session import get_session from utils.common import probe_metrics # Exit statuses OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 -if BASE.metadata.schema: - schema = BASE.metadata.schema + '.' -else: - schema = '' PROM_SERVERS = config_get('monitor', 'prometheus_servers', raise_exception=False, default='') if PROM_SERVERS != '': @@ -39,14 +38,33 @@ if __name__ == "__main__": try: registry = CollectorRegistry() session = get_session() - sql = 'SELECT COUNT(1) FROM {schema}RULES where state=\'S\' and (error !=\'MissingSourceReplica\' or error IS NULL)'.format( - schema=schema) + sql = select( + func.count() + ).select_from( + models.ReplicationRule + ).where( + and_( + models.ReplicationRule.state == "S", + or_( + models.ReplicationRule.error != "MissingSourceReplica", + models.ReplicationRule.error == null() + ) + ) + ) result = session.execute(sql).fetchone()[0] probe_metrics.gauge(name='judge.stuck_rules_without_missing_source_replica').set(result) Gauge('judge_stuck_rules_without_missing_source_replica', '', registry=registry).set(result) - sql = 'SELECT COUNT(1) FROM {schema}RULES where state=\'S\' and error =\'MissingSourceReplica\''.format( - schema=schema) + sql = select( + func.count() + ).select_from( + models.ReplicationRule + ).where( + and_( + models.ReplicationRule.state == "S", + models.ReplicationRule.error == "MissingSourceReplica" + ) + ) result = session.execute(sql).fetchone()[0] probe_metrics.gauge(name='judge.stuck_rules_with_missing_source_replica').set(result) Gauge('judge_stuck_rules_with_missing_source_replica', '', registry=registry).set(result) From 71f3cbb052e05eecb252ab6711dd4af4a4e5ec5b Mon Sep 17 00:00:00 2001 From: voetberg Date: Wed, 18 Mar 2026 09:54:57 -0500 Subject: [PATCH 3/6] Common: Update copyright header Issue: probes#127 --- common/check_stuck_rules | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/common/check_stuck_rules b/common/check_stuck_rules index c2c89ea7..e40f1f6f 100755 --- a/common/check_stuck_rules +++ b/common/check_stuck_rules @@ -1,14 +1,17 @@ #!/usr/bin/env python3 -# Copyright European Organization for Nuclear Research (CERN) 2013 +# Copyright European Organization for Nuclear Research (CERN) since 2012 # # Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Authors: -# - Martin Barisits, , 2014 -# - Eric Vaandering, , 2019-2021 -# - Thomas Beermann, , 2019 +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Probe to check the backlog of stuck rules. From f898817a51d2c015e917f5be04e856e1178f939b Mon Sep 17 00:00:00 2001 From: voetberg Date: Wed, 18 Mar 2026 09:56:52 -0500 Subject: [PATCH 4/6] Common: Change to using 'scalar_one' over 'fetchone' Issue: probes#127 --- common/check_stuck_rules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/check_stuck_rules b/common/check_stuck_rules index e40f1f6f..a60d2db5 100755 --- a/common/check_stuck_rules +++ b/common/check_stuck_rules @@ -54,7 +54,7 @@ if __name__ == "__main__": ) ) ) - result = session.execute(sql).fetchone()[0] + result = session.execute(sql).scalar_one() probe_metrics.gauge(name='judge.stuck_rules_without_missing_source_replica').set(result) Gauge('judge_stuck_rules_without_missing_source_replica', '', registry=registry).set(result) @@ -68,7 +68,7 @@ if __name__ == "__main__": models.ReplicationRule.error == "MissingSourceReplica" ) ) - result = session.execute(sql).fetchone()[0] + result = session.execute(sql).scalar_one() probe_metrics.gauge(name='judge.stuck_rules_with_missing_source_replica').set(result) Gauge('judge_stuck_rules_with_missing_source_replica', '', registry=registry).set(result) From 94dc1ea1069040eec9fae1493fcb032b27359ea8 Mon Sep 17 00:00:00 2001 From: voetberg Date: Wed, 18 Mar 2026 09:58:19 -0500 Subject: [PATCH 5/6] Common: Rename queries Issue: probes#127 --- common/check_stuck_rules | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/check_stuck_rules b/common/check_stuck_rules index a60d2db5..c2251589 100755 --- a/common/check_stuck_rules +++ b/common/check_stuck_rules @@ -41,7 +41,7 @@ if __name__ == "__main__": try: registry = CollectorRegistry() session = get_session() - sql = select( + without_missing_stmt = select( func.count() ).select_from( models.ReplicationRule @@ -54,11 +54,11 @@ if __name__ == "__main__": ) ) ) - result = session.execute(sql).scalar_one() + result = session.execute(without_missing_stmt).scalar_one() probe_metrics.gauge(name='judge.stuck_rules_without_missing_source_replica').set(result) Gauge('judge_stuck_rules_without_missing_source_replica', '', registry=registry).set(result) - sql = select( + with_missing_stmt = select( func.count() ).select_from( models.ReplicationRule @@ -68,7 +68,7 @@ if __name__ == "__main__": models.ReplicationRule.error == "MissingSourceReplica" ) ) - result = session.execute(sql).scalar_one() + result = session.execute(with_missing_stmt).scalar_one() probe_metrics.gauge(name='judge.stuck_rules_with_missing_source_replica').set(result) Gauge('judge_stuck_rules_with_missing_source_replica', '', registry=registry).set(result) From ac4a5447c6e9f7e36fad996791552ea45b7b8d06 Mon Sep 17 00:00:00 2001 From: voetberg Date: Wed, 18 Mar 2026 10:12:05 -0500 Subject: [PATCH 6/6] Common: Change to updated PrometheusPusher manager Issue: probes#127 --- common/check_stuck_rules | 66 ++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/common/check_stuck_rules b/common/check_stuck_rules index c2251589..efab6bd4 100755 --- a/common/check_stuck_rules +++ b/common/check_stuck_rules @@ -20,64 +20,50 @@ Probe to check the backlog of stuck rules. import sys import traceback -from prometheus_client import CollectorRegistry, Gauge, push_to_gateway from sqlalchemy.sql import and_, func, null, or_, select -from rucio.common.config import config_get from rucio.db.sqla import models from rucio.db.sqla.session import get_session -from utils.common import probe_metrics +from utils.common import PrometheusPusher # Exit statuses OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 -PROM_SERVERS = config_get('monitor', 'prometheus_servers', raise_exception=False, default='') -if PROM_SERVERS != '': - PROM_SERVERS = PROM_SERVERS.split(',') - if __name__ == "__main__": try: - registry = CollectorRegistry() session = get_session() - without_missing_stmt = select( - func.count() - ).select_from( - models.ReplicationRule - ).where( - and_( - models.ReplicationRule.state == "S", - or_( - models.ReplicationRule.error != "MissingSourceReplica", - models.ReplicationRule.error == null() + with PrometheusPusher(job_name='check_stuck_rules') as manager: + without_missing_stmt = select( + func.count() + ).select_from( + models.ReplicationRule + ).where( + and_( + models.ReplicationRule.state == "S", + or_( + models.ReplicationRule.error != "MissingSourceReplica", + models.ReplicationRule.error == null() + ) ) ) - ) - result = session.execute(without_missing_stmt).scalar_one() - probe_metrics.gauge(name='judge.stuck_rules_without_missing_source_replica').set(result) - Gauge('judge_stuck_rules_without_missing_source_replica', '', registry=registry).set(result) + result = session.execute(without_missing_stmt).scalar_one() + manager.gauge(name='judge.stuck_rules_without_missing_source_replica').set(result) - with_missing_stmt = select( - func.count() - ).select_from( - models.ReplicationRule - ).where( - and_( - models.ReplicationRule.state == "S", - models.ReplicationRule.error == "MissingSourceReplica" + with_missing_stmt = select( + func.count() + ).select_from( + models.ReplicationRule + ).where( + and_( + models.ReplicationRule.state == "S", + models.ReplicationRule.error == "MissingSourceReplica" + ) ) - ) - result = session.execute(with_missing_stmt).scalar_one() - probe_metrics.gauge(name='judge.stuck_rules_with_missing_source_replica').set(result) - Gauge('judge_stuck_rules_with_missing_source_replica', '', registry=registry).set(result) + result = session.execute(with_missing_stmt).scalar_one() + manager.gauge(name='judge.stuck_rules_with_missing_source_replica').set(result) - if len(PROM_SERVERS): - for server in PROM_SERVERS: - try: - push_to_gateway(server.strip(), job='check_stuck_rules', registry=registry) - except: - continue except: print(traceback.format_exc()) sys.exit(UNKNOWN)