Skip to content

Commit

Permalink
[txns] fix race between tasks and destructor
Browse files Browse the repository at this point in the history
It previously possible for a CommitTask to be destructed before
completing the loop of scheduling all asynchronous tasks. This led to a
race as seen below:

WARNING: ThreadSanitizer: data race (pid=32435)
  Write of size 8 at 0x7b1c000ce2d8 by thread T105 (mutexes: write M424881254664896540):
    #0 std::__1::__vector_base<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >::__destruct_at_end(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/vector:427:12 (txn_commit-itest+0x576cb1)
    #1 std::__1::__vector_base<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >::clear() /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/vector:369:29 (txn_commit-itest+0x5770d1)
    #2 std::__1::__vector_base<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >::~__vector_base() /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/vector:463:9 (txn_commit-itest+0x59caf9)
    #3 std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >::~vector() /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/vector:555:5 (libtransactions.so+0x8c2a0)
    #4 kudu::transactions::CommitTasks::~CommitTasks() ../src/kudu/transactions/txn_status_manager.h:177:26 (libtransactions.so+0xcce8b)
    #5 kudu::RefCountedThreadSafe<kudu::transactions::CommitTasks, kudu::DefaultRefCountedThreadSafeTraits<kudu::transactions::CommitTasks> >::DeleteInternal(kudu::transactions::CommitTasks const*) ../src/kudu/gutil/ref_counted.h:153:44 (libtransactions.so+0xcce1a)
    #6 kudu::DefaultRefCountedThreadSafeTraits<kudu::transactions::CommitTasks>::Destruct(kudu::transactions::CommitTasks const*) ../src/kudu/gutil/ref_counted.h:116:5 (libtransactions.so+0xccdc8)
    #7 kudu::RefCountedThreadSafe<kudu::transactions::CommitTasks, kudu::DefaultRefCountedThreadSafeTraits<kudu::transactions::CommitTasks> >::Release() const ../src/kudu/gutil/ref_counted.h:144:7 (libtransactions.so+0xccd70)
    #8 scoped_refptr<kudu::transactions::CommitTasks>::~scoped_refptr() ../src/kudu/gutil/ref_counted.h:266:13 (libtransactions.so+0xbf785)
    #9 std::__1::pair<long const, scoped_refptr<kudu::transactions::CommitTasks> >::~pair() /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/utility:315:29 (libtransactions.so+0xc7652)
    #10 void std::__1::allocator_traits<std::__1::allocator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*> > >::__destroy<std::__1::pair<long const, scoped_refptr<kudu::transactions::CommitTasks> > >(std::__1::integral_constant<bool, false>, std::__1::allocator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*> >&, std::__1::pair<long const, scoped_refptr<kudu::transactions::CommitTasks> >*) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/memory:1747:23 (libtransactions.so+0xc7614)
    #11 void std::__1::allocator_traits<std::__1::allocator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*> > >::destroy<std::__1::pair<long const, scoped_refptr<kudu::transactions::CommitTasks> > >(std::__1::allocator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*> >&, std::__1::pair<long const, scoped_refptr<kudu::transactions::CommitTasks> >*) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/memory:1595:14 (libtransactions.so+0xc7518)
    #12 std::__1::__hash_node_destructor<std::__1::allocator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*> > >::operator()(std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*>*) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/__hash_table:844:13 (libtransactions.so+0xc740d)
    #13 std::__1::unique_ptr<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*>, std::__1::__hash_node_destructor<std::__1::allocator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*> > > >::reset(std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*>*) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/memory:2593:7 (libtransactions.so+0xc72e0)
    #14 std::__1::unique_ptr<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*>, std::__1::__hash_node_destructor<std::__1::allocator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*> > > >::~unique_ptr() /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/memory:2547:19 (libtransactions.so+0xc6cbc)
    #15 std::__1::__hash_table<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, std::__1::__unordered_map_hasher<long, std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, std::__1::hash<long>, true>, std::__1::__unordered_map_equal<long, std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, std::__1::equal_to<long>, true>, std::__1::allocator<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> > > >::erase(std::__1::__hash_const_iterator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*>*>) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/__hash_table:2598:5 (libtransactions.so+0xc676e)
    apache#16 std::__1::unordered_map<long, scoped_refptr<kudu::transactions::CommitTasks>, std::__1::hash<long>, std::__1::equal_to<long>, std::__1::allocator<std::__1::pair<long const, scoped_refptr<kudu::transactions::CommitTasks> > > >::erase(std::__1::__hash_map_iterator<std::__1::__hash_iterator<std::__1::__hash_node<std::__1::__hash_value_type<long, scoped_refptr<kudu::transactions::CommitTasks> >, void*>*> >) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/unordered_map:1193:57 (libtransactions.so+0xc5b40)
    apache#17 kudu::transactions::TxnStatusManager::RemoveCommitTask(long, kudu::transactions::CommitTasks const*) ../src/kudu/transactions/txn_status_manager.h:433:26 (libtransactions.so+0xbefc6)
    apache#18 kudu::transactions::CommitTasks::IsShuttingDownCleanupIfLastOp() ../src/kudu/transactions/txn_status_manager.cc:181:28 (libtransactions.so+0x97dea)
    apache#19 kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2::operator()(kudu::Status const&) const ../src/kudu/transactions/txn_status_manager.cc:319:9 (libtransactions.so+0xaefd6)
    apache#20 decltype(std::__1::forward<kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2&>(fp)(std::__1::forward<kudu::Status const&>(fp0))) std::__1::__invoke<kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2&, kudu::Status const&>(kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2&, kudu::Status const&) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/type_traits:3530:1 (libtransactions.so+0xaeefd)
    apache#21 void std::__1::__invoke_void_return_wrapper<void>::__call<kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2&, kudu::Status const&>(kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2&, kudu::Status const&) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/__functional_base:348:9 (libtransactions.so+0xaee3d)
    apache#22 std::__1::__function::__alloc_func<kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2, std::__1::allocator<kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2>, void (kudu::Status const&)>::operator()(kudu::Status const&) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:1533:16 (libtransactions.so+0xaedbd)
    apache#23 std::__1::__function::__func<kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2, std::__1::allocator<kudu::transactions::CommitTasks::AbortTxnAsyncTask(int)::$_2>, void (kudu::Status const&)>::operator()(kudu::Status const&) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:1707:12 (libtransactions.so+0xad06c)
    apache#24 std::__1::__function::__value_func<void (kudu::Status const&)>::operator()(kudu::Status const&) const /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:1860:16 (libmaster.so+0x32ca24)
    apache#25 std::__1::function<void (kudu::Status const&)>::operator()(kudu::Status const&) const /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:2419:12 (libmaster.so+0x31d80b)
    apache#26 kudu::transactions::ParticipantRpc::Finish(kudu::Status const&) ../src/kudu/transactions/participant_rpc.cc:227:3 (libtransactions.so+0x7f3e7)
    ...

  Previous read of size 8 at 0x7b1c000ce2d8 by thread T186 (mutexes: read M322424363142217872):
    #0 std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >::size() const /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/vector:656:46 (libtransactions.so+0x8d2f9)
    #1 kudu::transactions::CommitTasks::AbortTxnAsync() ../src/kudu/transactions/txn_status_manager.cc:365:42 (libtransactions.so+0x989d2)
    #2 kudu::transactions::TxnStatusManager::BeginAbortTransaction(long, boost::optional<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > const&, kudu::tserver::TabletServerErrorPB*) ../src/kudu/transactions/txn_status_manager.cc:1219:25 (libtransactions.so+0xa3cc6)
    #3 kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3::operator()() const ../src/kudu/transactions/txn_status_manager.cc:378:3 (libtransactions.so+0xb245d)
    #4 decltype(std::__1::forward<kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3&>(fp)()) std::__1::__invoke<kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3&>(kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3&) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/type_traits:3530:1 (libtransactions.so+0xb2180)
    #5 void std::__1::__invoke_void_return_wrapper<void>::__call<kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3&>(kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3&) /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/__functional_base:348:9 (libtransactions.so+0xb20e0)
    #6 std::__1::__function::__alloc_func<kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3, std::__1::allocator<kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3>, void ()>::operator()() /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:1533:16 (libtransactions.so+0xb2080)
    #7 std::__1::__function::__func<kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3, std::__1::allocator<kudu::transactions::CommitTasks::ScheduleBeginAbortTxnWrite()::$_3>, void ()>::operator()() /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:1707:12 (libtransactions.so+0xb042f)
    #8 std::__1::__function::__value_func<void ()>::operator()() const /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:1860:16 (libtserver_test_util.so+0x58396)
    #9 std::__1::function<void ()>::operator()() const /data/3/awong/Repositories/kudu/thirdparty/installed/tsan/include/c++/v1/functional:2419:12 (libtserver_test_util.so+0x58098)
    ...

This patch fixes this by caching the size before iterating. Prior to
this patch, the test failed in TSAN mode 3/100 times. With this patch,
it passed 1000/1000 times.

Change-Id: Ic974354b300f2a6c1b04505e740249273f33b80c
Reviewed-on: http://gerrit.cloudera.org:8080/17283
Reviewed-by: Alexey Serbin <[email protected]>
Tested-by: Kudu Jenkins
  • Loading branch information
andrwng committed Apr 8, 2021
1 parent 0e1a154 commit c2e5373
Showing 1 changed file with 15 additions and 5 deletions.
20 changes: 15 additions & 5 deletions src/kudu/transactions/txn_status_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -361,8 +361,11 @@ void CommitTasks::AbortTxnAsync() {
if (participant_ids_.empty()) {
ScheduleFinalizeAbortTxnWrite();
} else {
ops_in_flight_ = participant_ids_.size();
for (int i = 0; i < participant_ids_.size(); i++) {
// NOTE: the final AbortTxnAsyncTask() call may destruct this CommitTask
// and its members, so cache the participant ID size.
const auto participant_ids_size = participant_ids_.size();
ops_in_flight_ = participant_ids_size;
for (int i = 0; i < participant_ids_size; i++) {
AbortTxnAsyncTask(i);
}
}
Expand Down Expand Up @@ -434,8 +437,11 @@ void CommitTasks::FinalizeCommitAsync() {
// tasks to complete.
auto old_val = ops_in_flight_.exchange(participant_ids_.size());
DCHECK_EQ(0, old_val);
ops_in_flight_ = participant_ids_.size();
for (int i = 0; i < participant_ids_.size(); i++) {
// NOTE: the final FinalizeCommitAsyncTask() call may destruct this
// CommitTask and its members, so cache the participant ID size.
const auto participant_ids_size = participant_ids_.size();
ops_in_flight_ = participant_ids_size;
for (int i = 0; i < participant_ids_size; i++) {
FinalizeCommitAsyncTask(i);
}
}
Expand Down Expand Up @@ -1010,10 +1016,14 @@ void CommitTasks::BeginCommitAsync() {
// If there are some participants, schedule beginning commit tasks so
// we can determine a finalized commit timestamp.
//
// NOTE: the final BeginCommitAsyncTask() call may destruct this CommitTask
// and its members, so cache the participant ID size.
//
// TODO(awong): consider an approach in which clients propagate
// timestamps in such a way that the client's call to begin commit
// includes the expected finalized commit timestamp.
for (int i = 0; i < participant_ids_.size(); i++) {
const auto participant_ids_size = participant_ids_.size();
for (int i = 0; i < participant_ids_size; i++) {
BeginCommitAsyncTask(i);
}
}
Expand Down

0 comments on commit c2e5373

Please sign in to comment.