Skip to content

Commit e7218d5

Browse files
sluongng and copybara-github
authored and committed
BES: make uploader retry attempts configurable
Depending on the Build Event Service setup, there can be different failure modes that tolerate fewer or more failures when uploading build events. Allow users to tweak the number of retries without having to pass custom JVM args or ship a fork of Bazel with these numbers tweaked. Closes bazelbuild#16305. PiperOrigin-RevId: 482303303 Change-Id: I71d9aeaf7527b0ff1a81af069390eedee2c22aa0
1 parent 58edc17 commit e7218d5

File tree

3 files changed

+42
-17
lines changed

3 files changed

+42
-17
lines changed

src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceUploader.java

+8 -11
Original file line number | Diff line number | Diff line change
@@ -93,12 +93,6 @@
9393
public final class BuildEventServiceUploader implements Runnable {
9494
private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();
9595

96-
/** Configuration knobs related to RPC retries. Values chosen by good judgement. */
97-
private static final int MAX_NUM_RETRIES =
98-
Integer.parseInt(System.getProperty("BAZEL_BES_NUM_RETRIES_ON_RPC_FAILURE", "4"));
99-
100-
private static final int DELAY_MILLIS = 1000;
101-
10296
private final BuildEventServiceClient besClient;
10397
private final BuildEventArtifactUploader buildEventUploader;
10498
private final BuildEventServiceProtoUtil besProtoUtil;
@@ -544,7 +538,7 @@ private void publishBuildEvents()
544538
BuildProgress.Code.BES_STREAM_NOT_RETRYING_FAILURE,
545539
message);
546540
}
547-
if (retryAttempt == MAX_NUM_RETRIES) {
541+
if (retryAttempt == buildEventProtocolOptions.besUploadMaxRetries) {
548542
String message =
549543
String.format(
550544
"Not retrying publishBuildEvents, no more attempts left: status='%s'",
@@ -629,7 +623,7 @@ private void publishLifecycleEvent(PublishLifecycleEventRequest request)
629623
throws DetailedStatusException, InterruptedException {
630624
int retryAttempt = 0;
631625
StatusException cause = null;
632-
while (retryAttempt <= MAX_NUM_RETRIES) {
626+
while (retryAttempt <= this.buildEventProtocolOptions.besUploadMaxRetries) {
633627
try {
634628
besClient.publish(request);
635629
return;
@@ -656,7 +650,7 @@ private void publishLifecycleEvent(PublishLifecycleEventRequest request)
656650
throw withFailureDetail(
657651
cause,
658652
BuildProgress.Code.BES_UPLOAD_RETRY_LIMIT_EXCEEDED_FAILURE,
659-
"All retry attempts failed.");
653+
String.format("All %d retry attempts failed.", retryAttempt - 1));
660654
}
661655

662656
private void ensureUploadThreadStarted() {
@@ -723,9 +717,12 @@ private static boolean shouldRetryStatus(Status status) {
723717
&& !status.getCode().equals(Code.FAILED_PRECONDITION);
724718
}
725719

726-
private static long retrySleepMillis(int attempt) {
720+
private long retrySleepMillis(int attempt) {
721+
Preconditions.checkArgument(attempt >= 0, "attempt must be nonnegative: %s", attempt);
727722
// This somewhat matches the backoff used for gRPC connection backoffs.
728-
return (long) (DELAY_MILLIS * Math.pow(1.6, attempt));
723+
return (long)
724+
(this.buildEventProtocolOptions.besUploadRetryInitialDelay.toMillis()
725+
* Math.pow(1.6, attempt));
729726
}
730727

731728
private DetailedStatusException withFailureDetail(

src/main/java/com/google/devtools/build/lib/buildeventstream/BuildEventProtocolOptions.java

+24 -6
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
import com.google.devtools.common.options.OptionDocumentationCategory;
1919
import com.google.devtools.common.options.OptionEffectTag;
2020
import com.google.devtools.common.options.OptionsBase;
21+
import java.time.Duration;
2122

2223
/** Options used to configure the build event protocol. */
2324
public class BuildEventProtocolOptions extends OptionsBase {
@@ -34,14 +35,31 @@ public class BuildEventProtocolOptions extends OptionsBase {
3435
public boolean legacyImportantOutputs;
3536

3637
@Option(
37-
name = "experimental_build_event_upload_strategy",
38-
defaultValue = "null",
39-
documentationCategory = OptionDocumentationCategory.LOGGING,
40-
effectTags = {OptionEffectTag.AFFECTS_OUTPUTS},
41-
help = "Selects how to upload artifacts referenced in the build event protocol."
42-
)
38+
name = "experimental_build_event_upload_strategy",
39+
defaultValue = "null",
40+
documentationCategory = OptionDocumentationCategory.LOGGING,
41+
effectTags = {OptionEffectTag.AFFECTS_OUTPUTS},
42+
help = "Selects how to upload artifacts referenced in the build event protocol.")
4343
public String buildEventUploadStrategy;
4444

45+
@Option(
46+
name = "experimental_build_event_upload_max_retries",
47+
defaultValue = "4",
48+
documentationCategory = OptionDocumentationCategory.LOGGING,
49+
effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
50+
help = "The maximum number of times Bazel should retry uploading a build event.")
51+
public int besUploadMaxRetries;
52+
53+
@Option(
54+
name = "experimental_build_event_upload_retry_minimum_delay",
55+
defaultValue = "1s",
56+
documentationCategory = OptionDocumentationCategory.LOGGING,
57+
effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
58+
help =
59+
"Initial, minimum delay for exponential backoff retries when BEP upload fails. (exponent:"
60+
+ " 1.6)")
61+
public Duration besUploadRetryInitialDelay;
62+
4563
@Option(
4664
name = "experimental_stream_log_file_uploads",
4765
defaultValue = "false",

src/test/java/com/google/devtools/build/lib/buildeventservice/BazelBuildEventServiceModuleTest.java

+10
Original file line number | Diff line number | Diff line change
@@ -232,6 +232,16 @@ public void testCreatesStreamerForBesTransport() throws Exception {
232232
.isInstanceOf(BuildEventServiceTransport.class);
233233
}
234234

235+
@Test
236+
public void testRetryCount() throws Exception {
237+
runBuildWithOptions(
238+
"--bes_backend=does.not.exist:1234", "--experimental_build_event_upload_max_retries=3");
239+
afterBuildCommand();
240+
241+
events.assertContainsError(
242+
"The Build Event Protocol upload failed: All 3 retry attempts failed");
243+
}
244+
235245
@Test
236246
public void testConnectivityFailureDisablesBesStreaming() throws Exception {
237247
class FailingConnectivityStatusProvider extends BlazeModule

0 commit comments

Comments
 (0)