Skip to content

Commit 1940dfb

Browse files
authored
Automatically retry the build if a remote cache eviction error is encountered (#18171)
With #17358, Bazel exits with code 39 if the remote cache evicts blobs during the build. With #17462 and #17747, Bazel is able to continue the build without `bazel clean` or `bazel shutdown`. However, even with #17639 and the follow-up changes that extend the lease, the remote cache can still evict blobs in some rare cases. Building on the above changes, this PR makes Bazel retry the invocation when the previous invocation encountered a remote cache eviction error and `--experimental_remote_cache_eviction_retries` is set, or **build rewinding**. ``` $ bazel build --experimental_remote_cache_eviction_retries=5 ... INFO: Invocation ID: b7348bfa-9446-4c72-a888-0a0ad012f225 Loading: Loading: Loading: 0 packages loaded Analyzing: target //a:bar (0 packages loaded, 0 targets configured) INFO: Analyzed target //a:bar (0 packages loaded, 0 targets configured). INFO: Found 1 target... [0 / 2] [Prepa] BazelWorkspaceStatusAction stable-status.txt ERROR: .../workspace/a/BUILD:8:8: Executing genrule //a:bar failed: Failed to fetch blobs because they do not exist remotely: Missing digest: b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c/4 Target //a:bar failed to build Use --verbose_failures to see the command lines of failed build steps. INFO: Elapsed time: 0.447s, Critical Path: 0.05s INFO: 2 processes: 2 internal. ERROR: Build did NOT complete successfully Found remote cache eviction error, retrying the build... INFO: Invocation ID: 983f60dc-8bb9-4b82-aa33-a378469ce140 Loading: Loading: Loading: 0 packages loaded Analyzing: target //a:bar (0 packages loaded, 0 targets configured) INFO: Analyzed target //a:bar (0 packages loaded, 0 targets configured). INFO: Found 1 target... [0 / 2] [Prepa] BazelWorkspaceStatusAction stable-status.txt Target //a:bar up-to-date: bazel-bin/a/bar.out INFO: Elapsed time: 0.866s, Critical Path: 0.35s INFO: 3 processes: 1 internal, 1 processwrapper-sandbox, 1 remote. INFO: Build completed successfully, 3 total actions $ ``` Part of #16660.
Closes #17711. PiperOrigin-RevId: 520610524 Change-Id: I20d43d1968767a03250b9c8f8a6dda4e056d4f52
1 parent 5afb8b6 commit 1940dfb

File tree

7 files changed

+119
-23
lines changed

7 files changed

+119
-23
lines changed

src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java

+11
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,17 @@ public boolean usingLocalTestJobs() {
494494
+ "test log. Otherwise, Bazel generates a test.xml as part of the test action.")
495495
public boolean splitXmlGeneration;
496496

497+
@Option(
498+
name = "experimental_remote_cache_eviction_retries",
499+
defaultValue = "0",
500+
documentationCategory = OptionDocumentationCategory.REMOTE,
501+
effectTags = {OptionEffectTag.EXECUTION},
502+
help =
503+
"The maximum number of attempts to retry if the build encountered remote cache eviction"
504+
+ " error. A non-zero value will implicitly set"
505+
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.")
506+
public int remoteRetryOnCacheEviction;
507+
497508
/** An enum for specifying different formats of test output. */
498509
public enum TestOutputFormat {
499510
SUMMARY, // Provide summary output only.

src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java

+1-4
Original file line numberDiff line numberDiff line change
@@ -158,10 +158,7 @@ protected Completable onErrorResumeNext(Throwable error) {
158158
new EnvironmentalExecException(
159159
(BulkTransferException) error,
160160
FailureDetail.newBuilder()
161-
.setMessage(
162-
"Failed to fetch blobs because they do not exist remotely."
163-
+ " Build without the Bytes does not work if your remote"
164-
+ " cache evicts blobs during builds")
161+
.setMessage("Failed to fetch blobs because they do not exist remotely.")
165162
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
166163
.build());
167164
}

src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,7 @@ public void executorInit(CommandEnvironment env, BuildRequest request, ExecutorB
907907

908908
actionContextProvider.setTempPathGenerator(tempPathGenerator);
909909

910+
ExecutionOptions executionOptions = env.getOptions().getOptions(ExecutionOptions.class);
910911
RemoteOptions remoteOptions =
911912
Preconditions.checkNotNull(
912913
env.getOptions().getOptions(RemoteOptions.class), "RemoteOptions");
@@ -929,7 +930,8 @@ public void executorInit(CommandEnvironment env, BuildRequest request, ExecutorB
929930
tempPathGenerator,
930931
patternsToDownload,
931932
outputPermissions,
932-
remoteOptions.useNewExitCodeForLostInputs);
933+
remoteOptions.useNewExitCodeForLostInputs
934+
|| (executionOptions != null && executionOptions.remoteRetryOnCacheEviction > 0));
933935
env.getEventBus().register(actionInputFetcher);
934936
builder.setActionInputPrefetcher(actionInputFetcher);
935937
actionContextProvider.setActionInputFetcher(actionInputFetcher);

src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,8 @@ private SpawnResult handleError(
557557
catastrophe = true;
558558
} else if (remoteCacheFailed) {
559559
status = Status.REMOTE_CACHE_FAILED;
560-
if (remoteOptions.useNewExitCodeForLostInputs) {
560+
if (remoteOptions.useNewExitCodeForLostInputs
561+
|| executionOptions.remoteRetryOnCacheEviction > 0) {
561562
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED;
562563
} else {
563564
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED;

src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java

+42-14
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import com.google.devtools.build.lib.events.PrintingEventHandler;
4444
import com.google.devtools.build.lib.events.Reporter;
4545
import com.google.devtools.build.lib.events.StoredEventHandler;
46+
import com.google.devtools.build.lib.exec.ExecutionOptions;
4647
import com.google.devtools.build.lib.profiler.MemoryProfiler;
4748
import com.google.devtools.build.lib.profiler.Profiler;
4849
import com.google.devtools.build.lib.profiler.SilentCloseable;
@@ -54,6 +55,7 @@
5455
import com.google.devtools.build.lib.util.AnsiStrippingOutputStream;
5556
import com.google.devtools.build.lib.util.DebugLoggerConfigurator;
5657
import com.google.devtools.build.lib.util.DetailedExitCode;
58+
import com.google.devtools.build.lib.util.ExitCode;
5759
import com.google.devtools.build.lib.util.InterruptedFailureDetails;
5860
import com.google.devtools.build.lib.util.LoggingUtil;
5961
import com.google.devtools.build.lib.util.Pair;
@@ -230,18 +232,29 @@ public BlazeCommandResult exec(
230232
return createDetailedCommandResult(
231233
retrievedShutdownReason, FailureDetails.Command.Code.PREVIOUSLY_SHUTDOWN);
232234
}
233-
BlazeCommandResult result =
234-
execExclusively(
235-
originalCommandLine,
236-
invocationPolicy,
237-
args,
238-
outErr,
239-
firstContactTimeMillis,
240-
commandName,
241-
command,
242-
waitTimeInMs,
243-
startupOptionsTaggedWithBazelRc,
244-
commandExtensions);
235+
BlazeCommandResult result;
236+
int attempt = 0;
237+
while (true) {
238+
try {
239+
result =
240+
execExclusively(
241+
originalCommandLine,
242+
invocationPolicy,
243+
args,
244+
outErr,
245+
firstContactTimeMillis,
246+
commandName,
247+
command,
248+
waitTimeInMs,
249+
startupOptionsTaggedWithBazelRc,
250+
commandExtensions,
251+
attempt);
252+
break;
253+
} catch (RemoteCacheEvictedException e) {
254+
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
255+
attempt += 1;
256+
}
257+
}
245258
if (result.shutdown()) {
246259
setShutdownReason(
247260
"Server shut down "
@@ -289,7 +302,9 @@ private BlazeCommandResult execExclusively(
289302
BlazeCommand command,
290303
long waitTimeInMs,
291304
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
292-
List<Any> commandExtensions) {
305+
List<Any> commandExtensions,
306+
int attempt)
307+
throws RemoteCacheEvictedException {
293308
// Record the start time for the profiler. Do not put anything before this!
294309
long execStartTimeNanos = runtime.getClock().nanoTime();
295310

@@ -631,7 +646,18 @@ private BlazeCommandResult execExclusively(
631646
}
632647

633648
needToCallAfterCommand = false;
634-
return runtime.afterCommand(env, result);
649+
var newResult = runtime.afterCommand(env, result);
650+
if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
651+
var executionOptions =
652+
Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class));
653+
if (attempt < executionOptions.remoteRetryOnCacheEviction) {
654+
throw new RemoteCacheEvictedException();
655+
}
656+
}
657+
658+
return newResult;
659+
} catch (RemoteCacheEvictedException e) {
660+
throw e;
635661
} catch (Throwable e) {
636662
logger.atSevere().withCause(e).log("Shutting down due to exception");
637663
Crash crash = Crash.from(e);
@@ -665,6 +691,8 @@ private BlazeCommandResult execExclusively(
665691
}
666692
}
667693

694+
private static class RemoteCacheEvictedException extends IOException {}
695+
668696
private static void replayEarlyExitEvents(
669697
OutErr outErr,
670698
BlazeOptionHandler optionHandler,

src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java

+1-3
Original file line numberDiff line numberDiff line change
@@ -528,9 +528,7 @@ public void remoteCacheEvictBlobs_whenPrefetchingInput_exitWithCode39() throws E
528528
// Assert: Exit code is 39
529529
assertThat(error)
530530
.hasMessageThat()
531-
.contains(
532-
"Build without the Bytes does not work if your remote cache evicts blobs"
533-
+ " during builds");
531+
.contains("Failed to fetch blobs because they do not exist remotely");
534532
assertThat(error).hasMessageThat().contains(String.format("%s/%s", hashCode, bytes.length));
535533
assertThat(error.getDetailedExitCode().getExitCode().getNumericExitCode()).isEqualTo(39);
536534
}

src/test/shell/bazel/remote/build_without_the_bytes_test.sh

+59
Original file line numberDiff line numberDiff line change
@@ -1627,4 +1627,63 @@ end_of_record"
16271627
expect_log "$expected_result"
16281628
}
16291629

1630+
function test_remote_cache_eviction_retries() {
1631+
mkdir -p a
1632+
1633+
cat > a/BUILD <<'EOF'
1634+
genrule(
1635+
name = 'foo',
1636+
srcs = ['foo.in'],
1637+
outs = ['foo.out'],
1638+
cmd = 'cat $(SRCS) > $@',
1639+
)
1640+
1641+
genrule(
1642+
name = 'bar',
1643+
srcs = ['foo.out', 'bar.in'],
1644+
outs = ['bar.out'],
1645+
cmd = 'cat $(SRCS) > $@',
1646+
tags = ['no-remote-exec'],
1647+
)
1648+
EOF
1649+
1650+
echo foo > a/foo.in
1651+
echo bar > a/bar.in
1652+
1653+
# Populate remote cache
1654+
bazel build \
1655+
--remote_executor=grpc://localhost:${worker_port} \
1656+
--remote_download_minimal \
1657+
//a:bar >& $TEST_log || fail "Failed to build"
1658+
1659+
bazel clean
1660+
1661+
# Clean build, foo.out isn't downloaded
1662+
bazel build \
1663+
--remote_executor=grpc://localhost:${worker_port} \
1664+
--remote_download_minimal \
1665+
//a:bar >& $TEST_log || fail "Failed to build"
1666+
1667+
if [[ -f bazel-bin/a/foo.out ]]; then
1668+
fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded"
1669+
fi
1670+
1671+
# Evict blobs from remote cache
1672+
stop_worker
1673+
start_worker
1674+
1675+
echo "updated bar" > a/bar.in
1676+
1677+
# Incremental build triggers remote cache eviction error but Bazel
1678+
# automatically retries the build and reruns the generating actions for
1679+
# missing blobs
1680+
bazel build \
1681+
--remote_executor=grpc://localhost:${worker_port} \
1682+
--remote_download_minimal \
1683+
--experimental_remote_cache_eviction_retries=5 \
1684+
//a:bar >& $TEST_log || fail "Failed to build"
1685+
1686+
expect_log "Found remote cache eviction error, retrying the build..."
1687+
}
1688+
16301689
run_suite "Build without the Bytes tests"

0 commit comments

Comments
 (0)