Skip to content

Commit

Permalink
ARROW-13168: [C++][R] Enable runtime timezone database for Windows
Browse files Browse the repository at this point in the history
This allows for runtime configuration of the timezone database on Windows for C++ and R. Python will be handled later because its available timezone libraries use the binary rather than the text format, which is not yet supported by the vendored date library.

For R, Windows will only support the "C" locale, since (as far as I can tell) that's the only locale supported by the MingW std::locale implementation. I think R itself gets around this by implementing a completely custom version of `strftime()` and friends.

Closes #12536 from wjones127/ARROW-13168-timezone-database

Authored-by: Will Jones <[email protected]>
Signed-off-by: Jonathan Keane <[email protected]>
  • Loading branch information
wjones127 authored and jonkeane committed Mar 28, 2022
1 parent 919d113 commit f4dfd6c
Show file tree
Hide file tree
Showing 22 changed files with 304 additions and 97 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,9 @@ jobs:
with:
fetch-depth: 0
submodules: recursive
- name: Download Timezone Database
shell: bash
run: ci/scripts/download_tz_database.sh
- name: Build
shell: bash
run: ci/scripts/cpp_build.sh $(pwd) $(pwd)/build
Expand Down Expand Up @@ -319,6 +322,9 @@ jobs:
run: |
export CMAKE_BUILD_PARALLEL_LEVEL=$NUMBER_OF_PROCESSORS
ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build"
- name: Download Timezone Database
shell: bash
run: ci/scripts/download_tz_database.sh
- name: Download MinIO
shell: msys2 {0}
run: |
Expand Down
14 changes: 14 additions & 0 deletions ci/appveyor-cpp-setup.bat
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,17 @@ powershell.exe -Command "Start-Process clcache-server" || exit /B
if "%ARROW_S3%" == "ON" (
appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/minio.exe -FileName C:\Windows\Minio.exe || exit /B
)


@rem
@rem Download IANA Timezone Database for unit tests
@rem
@rem Each download/extract step aborts the setup on failure (like the other
@rem steps in this script). curl is given --fail so that an HTTP error is
@rem reported as a failure instead of the error page being saved as the
@rem archive / mapping file.
@rem (Doc section: Download timezone database)
curl --fail https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz || exit /B
mkdir tzdata
tar --extract --file tzdata.tar.gz --directory tzdata || exit /B
move tzdata %USERPROFILE%\Downloads\tzdata || exit /B
@rem Also need Windows timezone mapping
curl --fail https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^
  --output %USERPROFILE%\Downloads\tzdata\windowsZones.xml || exit /B
@rem (Doc section: Download timezone database)
30 changes: 30 additions & 0 deletions ci/scripts/download_tz_database.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

set -ex

# Everything is placed under the user's Downloads directory:
#   ~/Downloads/tzdata2021e.tar.gz   the downloaded archive
#   ~/Downloads/tzdata/              the extracted text database
download_dir=~/Downloads
tzdata_dir="${download_dir}/tzdata"
tzdata_archive="${download_dir}/tzdata2021e.tar.gz"

# Create the target directories before downloading: curl does not create
# missing parent directories for --output on its own.
mkdir -p "${tzdata_dir}"

# Download database
# --fail makes curl exit non-zero on an HTTP error (so `set -e` aborts)
# instead of saving the server's error page as the archive.
curl --fail --location https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output "${tzdata_archive}"

# Extract
tar --extract --file "${tzdata_archive}" --directory "${tzdata_dir}"

# Download Windows timezone mapping
curl --fail --location https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml --output "${tzdata_dir}/windowsZones.xml"
7 changes: 0 additions & 7 deletions cpp/src/arrow/compute/kernels/scalar_cast_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,6 @@ struct TemporalToStringCastFunctor<O, TimestampType> {
return Status::OK();
}));
} else {
#ifdef _WIN32
// TODO(ARROW-13168):
return Status::NotImplemented(
"Casting a timestamp with time zone to string is not yet supported on "
"Windows.");
#else
switch (ty.unit()) {
case TimeUnit::SECOND:
RETURN_NOT_OK(ConvertZoned<std::chrono::seconds>(input, timezone, &builder));
Expand All @@ -176,7 +170,6 @@ struct TemporalToStringCastFunctor<O, TimestampType> {
DCHECK(false);
return Status::NotImplemented("Unimplemented time unit");
}
#endif
}
std::shared_ptr<Array> output_array;
RETURN_NOT_OK(builder.Finish(&output_array));
Expand Down
43 changes: 14 additions & 29 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "arrow/testing/extension_type.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
Expand Down Expand Up @@ -1146,6 +1147,16 @@ constexpr char kTimestampSecondsJson[] =
constexpr char kTimestampExtremeJson[] =
R"(["1677-09-20T00:00:59.123456", "2262-04-13T23:23:23.999999"])";

/// Fixture for cast tests that operate on timestamps carrying a time zone.
///
/// On Windows the timezone database is set up through
/// InitTestTimezoneDatabase() before every test; on other platforms SetUp()
/// does nothing.
class CastTimezone : public ::testing::Test {
 protected:
  void SetUp() override {
#if defined(_WIN32)
    // The test is aborted if the timezone database cannot be initialized.
    ASSERT_OK(InitTestTimezoneDatabase());
#endif  // defined(_WIN32)
  }
};

TEST(Cast, TimestampToDate) {
// See scalar_temporal_test.cc
auto timestamps = ArrayFromJSON(timestamp(TimeUnit::NANO), kTimestampJson);
Expand Down Expand Up @@ -1181,12 +1192,7 @@ TEST(Cast, TimestampToDate) {
}
}

TEST(Cast, ZonedTimestampToDate) {
#ifdef _WIN32
// TODO(ARROW-13168): we lack tzdb on Windows
GTEST_SKIP() << "ARROW-13168: no access to timezone database on Windows";
#endif

TEST_F(CastTimezone, ZonedTimestampToDate) {
{
// See TestZoned in scalar_temporal_test.cc
auto timestamps =
Expand Down Expand Up @@ -1377,12 +1383,7 @@ TEST(Cast, TimestampToTime) {
}
}

TEST(Cast, ZonedTimestampToTime) {
#ifdef _WIN32
// TODO(ARROW-13168): we lack tzdb on Windows
GTEST_SKIP() << "ARROW-13168: no access to timezone database on Windows";
#endif

TEST_F(CastTimezone, ZonedTimestampToTime) {
CheckCast(ArrayFromJSON(timestamp(TimeUnit::NANO, "Pacific/Marquesas"), kTimestampJson),
ArrayFromJSON(time64(TimeUnit::NANO), R"([
52259123456789, 50003999999999, 56480001001001, 65000000000000,
Expand Down Expand Up @@ -1573,8 +1574,7 @@ TEST(Cast, TimestampToString) {
}
}

#ifndef _WIN32
TEST(Cast, TimestampWithZoneToString) {
TEST_F(CastTimezone, TimestampWithZoneToString) {
for (auto string_type : {utf8(), large_utf8()}) {
CheckCast(
ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"),
Expand Down Expand Up @@ -1608,21 +1608,6 @@ TEST(Cast, TimestampWithZoneToString) {
R"(["1968-11-30 13:30:44.123456789-0700", "2016-02-29 10:42:23.456789246-0700"])"));
}
}
#else
// TODO(ARROW-13168): we lack tzdb on Windows
// Windows-only variant of the test (the `#else` branch of `#ifndef _WIN32`):
// it checks that casting a zoned timestamp to a string type is rejected with
// NotImplemented, for both utf8 and large_utf8 targets.
TEST(Cast, TimestampWithZoneToString) {
for (auto string_type : {utf8(), large_utf8()}) {
// "UTC" zone
ASSERT_RAISES(NotImplemented, Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"),
"[-34226955, 1456767743]"),
CastOptions::Safe(string_type)));

// "America/Phoenix" zone, same input values
ASSERT_RAISES(NotImplemented,
Cast(ArrayFromJSON(timestamp(TimeUnit::SECOND, "America/Phoenix"),
"[-34226955, 1456767743]"),
CastOptions::Safe(string_type)));
}
}
#endif

TEST(Cast, DateToDate) {
auto day_32 = ArrayFromJSON(date32(), "[0, null, 100, 1, 10]");
Expand Down
19 changes: 13 additions & 6 deletions cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/formatting.h"
Expand Down Expand Up @@ -407,6 +408,14 @@ class ScalarTemporalTest : public ::testing::Test {
RoundTemporalOptions round_to_15_quarters =
RoundTemporalOptions(15, CalendarUnit::QUARTER);
RoundTemporalOptions round_to_15_years = RoundTemporalOptions(15, CalendarUnit::YEAR);

protected:
void SetUp() override {
#if defined(_WIN32)
  // Windows only: set up the timezone database before each test. The test is
  // aborted if initialization fails.
  ASSERT_OK(InitTestTimezoneDatabase());
#endif  // defined(_WIN32)
}
};

TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionAllTemporalTypes) {
Expand Down Expand Up @@ -564,8 +573,6 @@ TEST_F(ScalarTemporalTest, TestOutsideNanosecondRange) {
CheckScalarUnary("subsecond", unit, times, float64(), subsecond);
}

#ifndef _WIN32
// TODO: We should test on windows once ARROW-13168 is resolved.
TEST_F(ScalarTemporalTest, TestIsLeapYear) {
auto is_leap_year_marquesas =
"[false, true, false, false, false, false, false, false, false, false, false, "
Expand Down Expand Up @@ -792,7 +799,6 @@ TEST_F(ScalarTemporalTest, TestNonexistentTimezone) {
ASSERT_RAISES(Invalid, Subsecond(timestamp_array));
}
}
#endif

TEST_F(ScalarTemporalTest, Week) {
auto unit = timestamp(TimeUnit::NANO);
Expand Down Expand Up @@ -1611,8 +1617,6 @@ TEST_F(ScalarTemporalTest, TestTemporalDifferenceErrors) {
CallFunction("weeks_between", {arr1, arr1}, &options));
}

// TODO: We should test on windows once ARROW-13168 is resolved.
#ifndef _WIN32
TEST_F(ScalarTemporalTest, TestAssumeTimezone) {
std::string timezone_utc = "UTC";
std::string timezone_kolkata = "Asia/Kolkata";
Expand Down Expand Up @@ -1879,6 +1883,9 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) {
}

TEST_F(ScalarTemporalTest, StrftimeOtherLocale) {
#ifdef _WIN32
GTEST_SKIP() << "There is a known bug in strftime for locales on Windows (ARROW-15922)";
#else
if (!LocaleExists("fr_FR.UTF-8")) {
GTEST_SKIP() << "locale 'fr_FR.UTF-8' doesn't exist on this system";
}
Expand All @@ -1890,6 +1897,7 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) {
["01 janvier 1970 00:00:59,123", "18 août 2021 15:11:50,456", null])";
CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "UTC"), milliseconds, utf8(),
expected, &options);
#endif
}

TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) {
Expand Down Expand Up @@ -2583,7 +2591,6 @@ TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalKolkata) {
CheckScalarUnary("round_temporal", unit, times, unit, round_1_hours, &round_to_1_hours);
CheckScalarUnary("round_temporal", unit, times, unit, round_2_hours, &round_to_2_hours);
}
#endif // !_WIN32

} // namespace compute
} // namespace arrow
13 changes: 0 additions & 13 deletions cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1048,7 +1048,6 @@ struct RoundTemporal {
// ----------------------------------------------------------------------
// Convert timestamps to a string representation with an arbitrary format

#ifndef _WIN32
Result<std::locale> GetLocale(const std::string& locale) {
try {
return std::locale(locale.c_str());
Expand Down Expand Up @@ -1132,18 +1131,6 @@ struct Strftime {
return Status::OK();
}
};
#else
// TODO(ARROW-13168)
// Windows stub for the strftime kernel (the `#else` branch of
// `#ifndef _WIN32`): both the scalar and the array entry points ignore their
// arguments and report NotImplemented.
template <typename Duration, typename InType>
struct Strftime {
// Scalar overload: always returns Status::NotImplemented.
static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
return Status::NotImplemented("Strftime not yet implemented on windows.");
}
// Array overload: always returns Status::NotImplemented.
static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
return Status::NotImplemented("Strftime not yet implemented on windows.");
}
};
#endif

// ----------------------------------------------------------------------
// Convert string representations of timestamps in arbitrary format to timestamps
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/arrow/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "arrow/util/config.h"
#include "arrow/util/cpu_info.h"
#include "arrow/vendored/datetime.h"

namespace arrow {

Expand Down Expand Up @@ -62,6 +63,8 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
}
}

util::optional<std::string> timezone_db_path;

}; // namespace

const BuildInfo& GetBuildInfo() { return kBuildInfo; }
Expand All @@ -73,7 +76,32 @@ RuntimeInfo GetRuntimeInfo() {
MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); });
info.detected_simd_level =
MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); });
info.using_os_timezone_db = USE_OS_TZDB;
#if !USE_OS_TZDB
info.timezone_db_path = timezone_db_path;
#else
info.timezone_db_path = util::optional<std::string>();
#endif
return info;
}

/// Apply process-wide options.
///
/// Only `options.timezone_db_path` is consulted. When it is unset this is a
/// no-op that returns OK. When it is set:
///  - if Arrow was built with USE_OS_TZDB, Invalid is returned;
///  - otherwise the vendored date library is pointed at the given path and
///    its timezone database is reloaded. A std::runtime_error raised while
///    doing so is reported as IOError; on success the path is remembered so
///    that GetRuntimeInfo() can report it.
Status Initialize(const GlobalOptions& options) noexcept {
  // Guard clause: nothing to configure without a database path.
  if (!options.timezone_db_path.has_value()) {
    return Status::OK();
  }
#if USE_OS_TZDB
  return Status::Invalid(
      "Arrow was set to use OS timezone database at compile time, "
      "so a downloaded database cannot be provided at runtime.");
#else
  const std::string& requested_path = options.timezone_db_path.value();
  try {
    arrow_vendored::date::set_install(requested_path);
    arrow_vendored::date::reload_tzdb();
  } catch (const std::runtime_error& e) {
    return Status::IOError(e.what());
  }
  // Record the path only after the database was loaded successfully.
  timezone_db_path = requested_path;
  return Status::OK();
#endif  // USE_OS_TZDB
}

} // namespace arrow
18 changes: 18 additions & 0 deletions cpp/src/arrow/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

#include <string>

#include "arrow/status.h"
#include "arrow/util/config.h" // IWYU pragma: export
#include "arrow/util/optional.h"
#include "arrow/util/visibility.h"

namespace arrow {
Expand Down Expand Up @@ -62,6 +64,13 @@ struct RuntimeInfo {

/// The SIMD level available on the OS and CPU
std::string detected_simd_level;

/// Whether using the OS-based timezone database
/// This is set at compile-time.
bool using_os_timezone_db;

/// The path to the timezone database; by default None.
util::optional<std::string> timezone_db_path;
};

/// \brief Get runtime build info.
Expand All @@ -77,4 +86,13 @@ const BuildInfo& GetBuildInfo();
ARROW_EXPORT
RuntimeInfo GetRuntimeInfo();

/// \brief Options passed to arrow::Initialize().
struct GlobalOptions {
/// Path to text timezone database. This is only configurable on Windows,
/// which does not have a compatible OS timezone database.
///
/// When left unset, Initialize() does not touch the timezone database.
util::optional<std::string> timezone_db_path;
};

/// \brief Apply the given global options.
///
/// If `options.timezone_db_path` is set, the timezone database is loaded
/// from that path.
///
/// \param[in] options the options to apply
/// \return OK on success or when no timezone database path is given;
/// IOError if loading the database from the given path fails;
/// Invalid if a path is given but Arrow was built to use the OS timezone
/// database.
ARROW_EXPORT
Status Initialize(const GlobalOptions& options) noexcept;

} // namespace arrow
Loading

0 comments on commit f4dfd6c

Please sign in to comment.