
[tools] Add tool to copy table data to another table
Copy table data to another table; the two tables can be in the same
cluster or in different clusters. The two tables must have the same
table schema, but may have different partition schemas. Alternatively,
the tool can create the new table using the same table and partition
schema as the source table.

This tool is useful for copying small tables, or simply for copying
table schemas in a convenient way; the Java client's Backup/Restore
features are recommended for copying large tables.
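
As a rough illustration of what such a copy involves, the sketch below implements the scan-and-insert loop against the public Kudu C++ client API rather than the internal TableScanner added by this change. The master addresses, table names, and the (key INT32, val STRING) column layout are placeholder assumptions, not part of this commit.

```cpp
// Sketch: copy all rows of a small table from one cluster to another using
// only the public C++ client API. Master addresses, table names, and the
// (key INT32, val STRING) schema are placeholders for illustration.
#include <cstdlib>
#include <iostream>

#include <kudu/client/client.h>
#include <kudu/client/scan_batch.h>

using kudu::client::KuduClient;
using kudu::client::KuduClientBuilder;
using kudu::client::KuduInsert;
using kudu::client::KuduScanBatch;
using kudu::client::KuduScanner;
using kudu::client::KuduSession;
using kudu::client::KuduTable;
using kudu::client::sp::shared_ptr;

static void CheckOk(const kudu::Status& s) {
  if (!s.ok()) {
    std::cerr << s.ToString() << std::endl;
    std::exit(1);
  }
}

int main() {
  shared_ptr<KuduClient> src_client, dst_client;
  CheckOk(KuduClientBuilder().add_master_server_addr("src-master:7051").Build(&src_client));
  CheckOk(KuduClientBuilder().add_master_server_addr("dst-master:7051").Build(&dst_client));

  shared_ptr<KuduTable> src_table, dst_table;
  CheckOk(src_client->OpenTable("source_table", &src_table));
  CheckOk(dst_client->OpenTable("dest_table", &dst_table));

  shared_ptr<KuduSession> session = dst_client->NewSession();
  CheckOk(session->SetFlushMode(KuduSession::AUTO_FLUSH_BACKGROUND));

  KuduScanner scanner(src_table.get());
  CheckOk(scanner.Open());
  KuduScanBatch batch;
  while (scanner.HasMoreRows()) {
    CheckOk(scanner.NextBatch(&batch));
    for (KuduScanBatch::RowPtr row : batch) {
      // Re-encode each scanned row as an insert against the destination table.
      KuduInsert* insert = dst_table->NewInsert();
      int32_t key;
      kudu::Slice val;
      CheckOk(row.GetInt32("key", &key));
      CheckOk(row.GetString("val", &val));
      CheckOk(insert->mutable_row()->SetInt32("key", key));
      // Copy the string so the write doesn't reference scan-batch memory
      // that is invalidated by the next NextBatch() call.
      CheckOk(insert->mutable_row()->SetStringCopy("val", val));
      CheckOk(session->Apply(insert));
    }
  }
  CheckOk(session->Flush());
  return 0;
}
```

The friend declarations for tools::TableScanner added below presumably exist so the tool can copy rows generically, without spelling out each column type as this sketch does.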

Change-Id: Ifdec51701ac9ec57739b1a6f7c18786294642a54
acelyc111 committed Mar 22, 2019
1 parent 28c7067 commit 341ce19
Showing 13 changed files with 808 additions and 72 deletions.
2 changes: 2 additions & 0 deletions src/kudu/client/scan_batch.h
@@ -42,6 +42,7 @@ class Schema;

namespace tools {
class ReplicaDumper;
class TableScanner;
} // namespace tools

namespace client {
@@ -294,6 +295,7 @@ class KUDU_EXPORT KuduScanBatch::RowPtr {

private:
friend class KuduScanBatch;
friend class tools::TableScanner;
template<typename KeyTypeWrapper> friend struct SliceKeysTestSetup;
template<typename KeyTypeWrapper> friend struct IntKeysTestSetup;

2 changes: 1 addition & 1 deletion src/kudu/client/schema.cc
@@ -759,7 +759,7 @@ string KuduSchema::ToString() const {
}

KuduSchema KuduSchema::FromSchema(const Schema& schema) {
return KuduSchema(schema);
return KuduSchema(schema.CopyWithoutColumnIds());
}

Schema KuduSchema::ToSchema(const KuduSchema& kudu_schema) {
5 changes: 5 additions & 0 deletions src/kudu/common/partial_row.h
@@ -52,6 +52,10 @@ namespace tablet {
template<typename KeyTypeWrapper> struct NumTypeRowOps; // IWYU pragma: keep
} // namespace tablet

namespace tools {
class TableScanner;
} // namespace tools

/// @endcond

class Schema;
@@ -500,6 +504,7 @@ class KUDU_EXPORT KuduPartialRow {
friend class PartitionSchema;
friend class RowOperationsPBDecoder;
friend class RowOperationsPBEncoder;
friend class tools::TableScanner;
friend class TestScanSpec;
template<typename KeyTypeWrapper> friend struct client::SliceKeysTestSetup;
template<typename KeyTypeWrapper> friend struct client::IntKeysTestSetup;
39 changes: 23 additions & 16 deletions src/kudu/common/partition.h
@@ -136,6 +136,15 @@ class Partition {
// the methods which format individual partition keys do redact.
class PartitionSchema {
public:
struct RangeSchema {
std::vector<ColumnId> column_ids;
};

struct HashBucketSchema {
std::vector<ColumnId> column_ids;
int32_t num_buckets;
uint32_t seed;
};

// Deserializes a protobuf message into a partition schema.
static Status FromPB(const PartitionSchemaPB& pb,
@@ -234,21 +243,25 @@ class PartitionSchema {
// contain unredacted row data.
Status MakeUpperBoundRangePartitionKeyExclusive(KuduPartialRow* row) const;

// Decodes a range partition key into a partial row, with variable-length
// fields stored in the arena.
Status DecodeRangeKey(Slice* encode_key,
KuduPartialRow* partial_row,
Arena* arena) const;

const RangeSchema& range_partition_schema() const {
return range_schema_;
}

const std::vector<HashBucketSchema>& hash_partition_schemas() const {
return hash_bucket_schemas_;
}

private:
friend class PartitionPruner;
FRIEND_TEST(PartitionTest, TestIncrementRangePartitionBounds);
FRIEND_TEST(PartitionTest, TestIncrementRangePartitionStringBounds);

struct RangeSchema {
std::vector<ColumnId> column_ids;
};

struct HashBucketSchema {
std::vector<ColumnId> column_ids;
int32_t num_buckets;
uint32_t seed;
};

// Returns a text description of the encoded range key suitable for debug printing.
std::string RangeKeyDebugString(Slice range_key, const Schema& schema) const;
std::string RangeKeyDebugString(const KuduPartialRow& key) const;
@@ -318,12 +331,6 @@ class PartitionSchema {
// This method is useful used for encoding splits and bounds.
Status EncodeRangeKey(const KuduPartialRow& row, const Schema& schema, std::string* key) const;

// Decodes a range partition key into a partial row, with variable-length
// fields stored in the arena.
Status DecodeRangeKey(Slice* encode_key,
KuduPartialRow* partial_row,
Arena* arena) const;

// Decodes the hash bucket component of a partition key into its buckets.
//
// This should only be called with partition keys created from a row, not with
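
The partition.h change above makes RangeSchema and HashBucketSchema public and adds accessors for them, and also moves DecodeRangeKey() into the public section, presumably so the new tool can reproduce the source table's partition design on the destination. A minimal sketch of how the new accessors could be used from inside the Kudu source tree follows; the function name here is hypothetical.

```cpp
// Sketch (hypothetical helper inside the Kudu source tree): summarize a
// table's partition design using the accessors made public in this change.
#include <iostream>

#include "kudu/common/partition.h"

void DescribePartitioning(const kudu::PartitionSchema& pschema) {
  // Each hash bucket schema carries its column IDs, bucket count, and seed.
  for (const auto& hash : pschema.hash_partition_schemas()) {
    std::cout << "hash bucketing over " << hash.column_ids.size()
              << " column(s) into " << hash.num_buckets
              << " buckets (seed " << hash.seed << ")\n";
  }
  // The range schema lists the columns that make up the range partition key.
  std::cout << "range partitioning over "
            << pschema.range_partition_schema().column_ids.size()
            << " column(s)\n";
}
```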
2 changes: 1 addition & 1 deletion src/kudu/common/schema.cc
@@ -345,7 +345,6 @@ Schema Schema::CopyWithColumnIds() const {
}

Schema Schema::CopyWithoutColumnIds() const {
CHECK(has_column_ids());
return Schema(cols_, num_key_columns_);
}

@@ -430,6 +429,7 @@ string Schema::ToString(ToStringMode mode) const {
if (cols_.empty()) return "()";

vector<string> pk_strs;
pk_strs.reserve(num_key_columns_);
for (int i = 0; i < num_key_columns_; i++) {
pk_strs.push_back(cols_[i].name());
}
2 changes: 0 additions & 2 deletions src/kudu/common/schema.h
@@ -696,8 +696,6 @@ class Schema {

// Return a new Schema which is the same as this one, but without any column
// IDs assigned.
//
// Requires that this schema has column IDs.
Schema CopyWithoutColumnIds() const;

// Create a new schema containing only the selected columns.
7 changes: 3 additions & 4 deletions src/kudu/integration-tests/test_workload.cc
@@ -93,12 +93,11 @@ void TestWorkload::set_schema(const client::KuduSchema& schema) {
CHECK_GT(schema.num_columns(), 0) << "Schema should have at least one column";
std::vector<int> key_indexes;
schema.GetPrimaryKeyColumnIndexes(&key_indexes);
CHECK_EQ(1, key_indexes.size()) << "Schema should have just one key column";
CHECK_EQ(0, key_indexes[0]) << "Schema's key column should be index 0";
CHECK_LE(1, key_indexes.size()) << "Schema should have at least one key column";
CHECK_EQ(0, key_indexes[0]) << "Schema's first key column should be index 0";
KuduColumnSchema key = schema.Column(0);
CHECK_EQ("key", key.name()) << "Schema column should be named 'key'";
CHECK_EQ(KuduColumnSchema::INT32, key.type())
<< "Schema key column should be of type INT32";
<< "Schema's first key column should be of type INT32";
schema_ = schema;
}

9 changes: 9 additions & 0 deletions src/kudu/tools/data_gen_util.cc
@@ -61,6 +61,15 @@ void WriteValueToColumn(const client::KuduSchema& schema,
case client::KuduColumnSchema::BOOL:
CHECK_OK(row->SetBool(col_idx, value));
break;
case client::KuduColumnSchema::BINARY:
CHECK_OK(row->SetBinaryCopy(col_idx, FastHex64ToBuffer(value, buf)));
break;
case client::KuduColumnSchema::UNIXTIME_MICROS:
CHECK_OK(row->SetUnixTimeMicros(col_idx, value));
break;
case client::KuduColumnSchema::DECIMAL:
CHECK_OK(row->SetUnscaledDecimal(col_idx, value));
break;
default:
LOG(FATAL) << "Unexpected data type: " << type;
}
1 change: 0 additions & 1 deletion src/kudu/tools/kudu-admin-test.cc
@@ -89,7 +89,6 @@ using kudu::client::KuduInsert;
using kudu::client::KuduSchema;
using kudu::client::KuduSchemaBuilder;
using kudu::client::KuduTable;
using kudu::client::KuduTableAlterer;
using kudu::client::KuduTableCreator;
using kudu::client::KuduValue;
using kudu::client::sp::shared_ptr;