Skip to content

Commit 467eac5

Browse files
eolivelliclaude
andcommitted
Address review: snapshot clientCtx and guard timeoutQuorumWait against recycle
Per @merlimat's review on #4760, the original null-check in maybeTimeout() still races: clientCtx may be nulled between the guard and the subsequent getConf() dereference. Volatile only addresses visibility, not atomicity across two reads. - Snapshot clientCtx into a local in maybeTimeout(); the local cannot be mutated by another thread, so the dereference is race-free. - Drop the volatile modifier on clientCtx (no longer load-bearing once the read is done once into a local). - Extend timeoutQuorumWait()'s early-return to also short-circuit when lh / clientCtx are null. recyclePendAddOpObject() resets `completed` to false, so the prior `if (completed)` guard does not cover the already-recycled case; without this check timeoutQuorumWait() NPEs on lh.getLedgerMetadata() (and worse, could invoke handleUnrecoverableErrorDuringAdd on a stale handle). - Add testTimeoutQuorumWaitIsNoOpWhenAlreadyRecycled covering the new guard. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent dbdf559 commit 467eac5

2 files changed

Lines changed: 46 additions & 7 deletions

File tree

bookkeeper-server/src/main/java/org/apache/bookkeeper/client/PendingAddOp.java

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ class PendingAddOp implements WriteCallback {
6666
boolean completed = false;
6767

6868
LedgerHandle lh;
69-
volatile ClientContext clientCtx;
69+
ClientContext clientCtx;
7070
boolean isRecoveryAdd = false;
7171
volatile long requestTimeNanos;
7272
long qwcLatency; // Quorum Write Completion Latency after response from quorum bookies.
@@ -154,21 +154,28 @@ private void sendWriteRequest(List<BookieId> ensemble, int bookieIndex) {
154154
}
155155

156156
boolean maybeTimeout() {
157-
if (clientCtx == null) {
158-
// Op has already been recycled: recyclePendAddOpObject() cleared clientCtx while
159-
// monitorPendingAddOps() was still holding a reference to it from the iterator.
160-
// The add-entry completed before the timeout monitor fired; nothing to time out.
157+
// Snapshot clientCtx into a local: recyclePendAddOpObject() may run on another thread
158+
// and null the field while monitorPendingAddOps() still holds an iterator reference
159+
// to this op. A single read prevents the field from going null between the guard and
160+
// the getConf() dereference below.
161+
ClientContext ctx = clientCtx;
162+
if (ctx == null) {
163+
// Already recycled — the add-entry completed before the timeout monitor fired.
161164
return false;
162165
}
163-
if (MathUtils.elapsedNanos(requestTimeNanos) >= clientCtx.getConf().addEntryQuorumTimeoutNanos) {
166+
if (MathUtils.elapsedNanos(requestTimeNanos) >= ctx.getConf().addEntryQuorumTimeoutNanos) {
164167
timeoutQuorumWait();
165168
return true;
166169
}
167170
return false;
168171
}
169172

170173
synchronized void timeoutQuorumWait() {
171-
if (completed) {
174+
// lh / clientCtx are nulled by recyclePendAddOpObject() under this same monitor.
175+
// Once we hold the lock, a recycle has either not started or fully completed; if any
176+
// of these are null, the op has been recycled and there is nothing to time out.
177+
// (The completed flag alone is insufficient: recycle resets it to false.)
178+
if (completed || lh == null || clientCtx == null) {
172179
return;
173180
}
174181

bookkeeper-server/src/test/java/org/apache/bookkeeper/client/PendingAddOpTest.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@
2424
import static org.junit.Assert.assertNull;
2525
import static org.junit.Assert.assertSame;
2626
import static org.junit.Assert.assertTrue;
27+
import static org.mockito.ArgumentMatchers.anyInt;
2728
import static org.mockito.Mockito.mock;
29+
import static org.mockito.Mockito.never;
30+
import static org.mockito.Mockito.verify;
2831
import static org.mockito.Mockito.when;
2932

3033
import io.netty.buffer.ByteBuf;
@@ -165,4 +168,33 @@ public void testMaybeTimeoutReturnsTrueWhenQuorumTimeoutExpired() {
165168

166169
assertTrue(op.maybeTimeout());
167170
}
171+
172+
/**
173+
* Verify that {@link PendingAddOp#timeoutQuorumWait()} is a no-op when the op has already
174+
* been recycled (i.e. {@code lh} and {@code clientCtx} are {@code null}).
175+
*
176+
* <p>This covers the second concern raised in the review of #4760: even after
177+
* {@link PendingAddOp#maybeTimeout()} captures a non-null {@code clientCtx} snapshot,
178+
* {@code recyclePendAddOpObject()} may complete before {@code timeoutQuorumWait()}
179+
* acquires the monitor. The pre-existing {@code if (completed) return;} guard does not
180+
* cover this because recycling resets {@code completed} to {@code false}; without an
181+
* explicit null check the method NPEs on {@code lh.getLedgerMetadata()} (or similar)
182+
* and, worse, may invoke {@code handleUnrecoverableErrorDuringAdd} on a stale handle.
183+
*/
184+
@Test
185+
public void testTimeoutQuorumWaitIsNoOpWhenAlreadyRecycled() {
186+
PendingAddOp op = PendingAddOp.create(
187+
lh, mockClientContext, lh.getCurrentEnsemble(),
188+
payload, WriteFlag.NONE,
189+
(rc, handle, entryId, qwcLatency, ctx) -> {}, null);
190+
191+
// Simulate post-recycle state: recyclePendAddOpObject() has already nulled these fields.
192+
op.lh = null;
193+
op.clientCtx = null;
194+
195+
// Must not throw NPE — and must not invoke the unrecoverable-error handler on the
196+
// (now-stale) ledger handle, which would cause spurious failures on a recycled op.
197+
op.timeoutQuorumWait();
198+
verify(lh, never()).handleUnrecoverableErrorDuringAdd(anyInt());
199+
}
168200
}

0 commit comments

Comments
 (0)