Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion sotodlib/preprocess/processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2609,8 +2609,10 @@ def process(self, aman, proc_aman, sim=False, data_aman=None):
T_signal=data_aman.dsT
)
else:
pcfg = copy.deepcopy(self.process_cfgs)
pcfg.pop("fit_in_freq", None)
tod_ops.t2pleakage.subtract_t2p(aman, proc_aman['t2p'],
**self.process_cfgs)
**pcfg)
return aman, proc_aman

class SplitFlags(_Preprocess):
Expand Down
188 changes: 124 additions & 64 deletions sotodlib/site_pipeline/jobdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
Delete some jobs::

for j in jdb.get_jobs(jclass='my_analysis'):
jdb.remove_job(j.id)
jdb.remove_jobs(j.id)


"""
Expand Down Expand Up @@ -304,55 +304,103 @@ def get_jobs(self,
[session.expunge(j) for j in jobs]
return jobs

def lock(self, job_id, owner=None, force=False):
"""Lock a Job record by id. If the Job is already locked, a
JobLockedError is raised.
def lock(self, job_ids, owner=None, force=False):
"""Lock one or more Jobs record by id. If a Job is already locked,
a JobLockedError is raised.

Returns a Job object that has been expunged from the database
session. The object attributes can be modified, but won't be
written back to the database unless the object is merged into
a new session.
Returns the Job object(s) that has been expunged from the database
session. If a single job id is input one Job will be returned,
otherwise a list of Jobs will be returned. The object attributes can
be modified, but won't be written back to the database unless the
object is merged into a new session.

"""
if owner is None:
owner = self._lockstr()

now = time.time()

if isinstance(job_ids, int):
job_ids = [job_ids]

job_ids = [(j.id if isinstance(j, Job) else j) for j in job_ids]
Comment on lines +323 to +326

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either outlaw passing in Job(s) entirely, or accept that a user might pass in a single Job; e.g. isinstance(job_ids, (int, Job))


with self.session_scope() as session:
q = session.query(Job)
if force:
q = q.filter(sqy.and_(Job.id == job_id))
else:
q = q.filter(sqy.and_(Job.id == job_id,
Job.lock == None)) # noqa: E711
n = q.update({Job.lock: time.time(), Job.lock_owner: owner})
q = session.query(Job).filter(Job.id.in_(job_ids))

if not force:
q = q.filter(Job.lock == None)

n = q.update(
{Job.lock: now, Job.lock_owner: owner},
synchronize_session=False
)

session.commit()

with self.session_scope() as session:
job = session.get(Job, job_id)
session.expunge(job)
jobs = (
session.query(Job)
.filter(Job.id.in_(job_ids))
.all()
)

for job in jobs:
session.expunge(job)

if n == 0 or job.lock_owner != owner:
locked_jobs = [j for j in jobs if j.lock_owner == owner]

if n !=len(job_ids) or len(locked_jobs) != len(job_ids):
raise JobLockedError()
Comment on lines +353 to 354

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This behavior should be revisited -- as implemented here, if one of the many jobs is locked already, the code will lock them all and then raise an error. This is a sure-fire way for a small problem of a stray locked row to compound into multiple stray locked rows.

Reasonable behaviors:

  • best effort -- lock as many jobs as possible: lock and return only those ones (even if empty set) without raising any errors.
  • al-or-none; attempt lock of all jobs, but if any are already locked, then do not lock any of them (i.e. rollback()) and then raise the error.


return job
return locked_jobs[0] if len(locked_jobs) == 1 else locked_jobs

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As written, if I pass in [job_id] (a list) I will get back a scalar. That will cause trouble in some cases; better to capture scalarness in a variable at the top of this function, and use that to decide whether to unpack the list in the return.


def unlock(self, jobs, merge=True):
"""Unlock one or more jobs."""
if not isinstance(jobs, (list, tuple, set)):
jobs = [jobs]
Comment on lines +360 to +361

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make this check the same as what is done in lock (or make lock the same as here).


job_ids = []
job_objs = []

for j in jobs:
if isinstance(j, Job):
job_ids.append(j.id)
job_objs.append(j)
else:
job_ids.append(j)

def unlock(self, job, merge=True):
if not merge or isinstance(job, int):
if isinstance(job, Job):
job = job.id
if not merge or not job_objs:
with self.session_scope() as session:
session.query(Job).filter(Job.id == job).update(
{Job.lock: None, Job.lock_owner: None})
session.query(Job).filter(Job.id.in_(job_ids)).update(
{Job.lock: None, Job.lock_owner: None},
synchronize_session=False
)
session.commit()

else:
with self.session_scope() as session:
j1 = session.query(Job).filter(Job.id == job.id).one()
if j1.lock_owner is None:
raise JobNotLockedError()
if j1.lock_owner != job.lock_owner:
raise JobNotOwnedError()
job.lock = None
job.lock_owner = None
session.merge(job)
db_jobs = (
session.query(Job)
.filter(Job.id.in_(job_ids))
.all()
)

db_map = {j.id: j for j in db_jobs}

for job in job_objs:
j1 = db_map[job.id]

if j1.lock_owner is None:
raise JobNotLockedError()

if j1.lock_owner != job.lock_owner:
raise JobNotOwnedError()
Comment on lines +394 to +398

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above, error handling in the multiple-jobs should be done with some care. Perhaps on failure, you make a note and then try the next job? Then raise if any failures?

In a multi-job case it would be helpful to provide traceback about what job failed user's assumptions here.


job.lock = None
job.lock_owner = None
session.merge(job)

session.commit()

def clear_locks(self, jobs=None):
Expand All @@ -368,26 +416,30 @@ def clear_locks(self, jobs=None):
q = q.filter(Job.id == j)
q.update({Job.lock: None, Job.lock_owner: None})

def remove_job(self, job_id, check_locked=False):
def remove_jobs(self, job_ids, check_locked=False):
if isinstance(job_ids, (int, Job)):
job_ids = [job_ids]

job_ids = [j.id if isinstance(j, Job) else j for j in job_ids]

with self.session_scope() as session:
q = session.query(Job).filter(Job.id.in_(job_ids))

if check_locked:
q = session.query(Job).filter(
sqy.and_(Job.id == job_id,
Job.lock == None)) # noqa: E711
else:
q = session.query(Job).filter(Job.id == job_id)
q = q.filter(Job.lock == None) # noqa: E711

n = q.delete()
n = q.delete(synchronize_session=False)
session.commit()
if n == 0:

if n != len(job_ids):
raise JobLockedError()

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a custom error indicating that "only n of m jobs were successfully deleted"?


@contextmanager
def locked(self, jobs, count=None, owner=None):
"""Context Manager to grant exclusive access to one or more
Job. Job record is marked as locked, and this process may
freely work on the job and alter the job data. When execution
leaves the context, the Job will be marked as unlocked. Note
Jobs. Job records are marked as locked, and this process may
freely work on the jobs and alter the job data. When execution
leaves the context, the Jobs will be marked as unlocked. Note
the _database_ is only explicitly locked while this lock is
being acquired and released. In between, other entities can
do other database stuff.
Expand All @@ -411,30 +463,38 @@ def locked(self, jobs, count=None, owner=None):
"""
if owner is None:
owner = self._lockstr()

if isinstance(jobs, (int, Job)):
jobs = [jobs]

job_ids = [j.id if isinstance(j, Job) else j for j in jobs]

locked = []
for job in jobs:
if len(locked) >= (1 if count is None else count):
break
if isinstance(job, Job):
job = job.id
try:
j = self.lock(job)
except JobLockedError:
continue
locked.append(j)

try:
if count is None:
if len(locked):
yield locked[0]
else:
yield None
else:
yield locked
with self.session_scope() as session:
unlocked_ids = {
j.id for j in session.query(Job.id)
.filter(Job.id.in_(job_ids), Job.lock == None)
}

selected = []
limit = count if count is not None else 1

for jid in job_ids:
if jid in unlocked_ids:
selected.append(jid)
if len(selected) >= limit:
break

if selected:
locked = self.lock(selected, owner=owner)
Comment on lines +484 to +491

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Depending on how self.lock() error handling is modified --) There's a race condition here that will not necessarily produce count newly locked items. Although it will be slow on slow databases, you probably need a loop here to keep locking items until you reach count or run out of candidates.

There may be atomic one-liners to do this sort of thing; the original code didn't optimize for that (partly because I don't know sqlalchemy well enough). But consider something like:
update jobs set locked={timestamp}, owner={owner} where job_id in (select job_id from jobs where locked=null limit {count}).


yield locked

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs special handling to yield a single job if count is None.


finally:
for j in locked:
self.unlock(j)
if locked:
self.unlock(locked)

def get_resource(self, jclass, n=None, jstate='open', tags={}):
jobs = self.get_jobs(jclass, jstate=jstate, tags=tags)
Expand Down Expand Up @@ -552,7 +612,7 @@ def cli(args=None):
elif args.action == 'delete':
print('Removing jobs ...')
for j in jobs:
jdb.remove_job(j)
jdb.remove_jobs(j)
elif args.action.startswith('set-'):
for k in ['open', 'done', 'failed', 'ignored']:
if args.action == f'set-{k}':
Expand Down
69 changes: 55 additions & 14 deletions sotodlib/site_pipeline/multilayer_preprocess_tod.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def _main(executor: Union["MPICommExecutor", "ProcessPoolExecutor"],
exist_ok=True)

# jobdb
jobdb_path = configs_proc.get("jobdb", None)
jobdb_path = configs_proc.get("jobdb", {}).get("path")
if jobdb_path is not None:
jdb = JobManager(sqlite_file=jobdb_path)
# get init jobs
Expand Down Expand Up @@ -406,6 +406,25 @@ def _main(executor: Union["MPICommExecutor", "ProcessPoolExecutor"],
batch_size_init = configs_init['archive'].get('batch_size', 1)
batch_size_proc = configs_proc['archive'].get('batch_size', 1)

if jobdb_path is not None:
# batch updates to JobDb
jdb_batch_size = configs_proc['jobdb'].get('batch_size', 1)
batched_job_count = 0
batched_job_fields = []

# get jobs up front since it is faster
init_jobs = jdb.get_jobs(jclass="init", jstate=JState.open)
tags_to_job_init = {
frozenset({k: v for k, v in j.tags.items() if k != 'error'}.items()): j
for j in init_jobs
}

proc_jobs = jdb.get_jobs(jclass="proc", jstate=JState.open)
tags_to_job_proc = {
frozenset({k: v for k, v in j.tags.items() if k != 'error'}.items()): j
for j in proc_jobs
}

pb_name = os.path.join(pb_path or '', f"pb_{str(int(time.time()))}.txt")
with open(pb_name, 'w') as f:
with MultiDbBatchManager(
Expand Down Expand Up @@ -441,26 +460,48 @@ def _main(executor: Union["MPICommExecutor", "ProcessPoolExecutor"],
configs_proc, logger, overwrite,
db_manager=db_mgr_proc)


# update jobdb
if jobdb_path is not None:
tags = {}
tags["obs:obs_id"] = obs_id
for gb, g in zip(group_by, group):
tags['dets:' + gb] = g
jobs = jdb.get_jobs(jstate=JState.open, tags=tags)
# get both init and proc
init_job = tags_to_job_init.get(frozenset(tags.items()), None)
proc_job = tags_to_job_proc.get(frozenset(tags.items()), None)

if errors[0] is not None:
jstate = JState.failed
jerror = errors[0]
else:
jstate = JState.done
jerror = None

for job in [init_job, proc_job]:
if job is not None:
batched_job_fields.append(
{
"job": job,
"error": jerror,
"jstate": jstate,
}
)

batched_job_count += 1

if (batched_job_count >= jdb_batch_size) or (len(futures) == 0):
jobs = [j['job'] for j in batched_job_fields]
with jdb.locked(jobs, count=len(jobs)) as j:
for job_idx, job in enumerate(j):
job.mark_visited()
job.jstate = batched_job_fields[job_idx]["jstate"]
for _t in job._tags:
if _t.key == "error":
_t.value = batched_job_fields[job_idx]["error"]
batched_job_count = 0
batched_job_fields = []

for job in jobs:
# init layer state will be JState.done if already run
if job.jstate == JState.open:
with jdb.locked(job) as j:
j.mark_visited()
if errors[0] is not None:
j.jstate = JState.failed
for _t in j._tags:
if _t.key == "error":
_t.value = errors[0]
else:
j.jstate = JState.done
if raise_error:
n_obs_fail = 0
n_groups_fail = 0
Expand Down
Loading