Skip to content

Commit ea0289c

Browse files
committed
Canonicalize filename when comparing for duplicates
1 parent 6a79492 commit ea0289c

File tree

2 files changed

+41
-1
lines changed

2 files changed

+41
-1
lines changed

tests/unit/forklift/test_legacy.py

+39
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,45 @@ def test_is_duplicate_true(self, pyramid_config, db_request):
666666

667667
assert legacy._is_duplicate_file(db_request.db, filename, hashes)
668668

669+
@pytest.mark.parametrize("file_name_prefix", ["foo_bar", "foo.bar"])
@pytest.mark.parametrize("project_name", ["foo_bar", "foo.bar"])
def test_is_duplicate_true_non_normalized_filename(
    self, pyramid_config, db_request, file_name_prefix, project_name
):
    """Check that differently spelled filenames are still seen as duplicates.

    A file named after ``project_name`` is stored first; afterwards
    ``legacy._is_duplicate_file`` is queried with a filename built from
    ``file_name_prefix`` and the very same digests. Both parameters range
    over ``foo_bar`` and ``foo.bar``, so the matrix covers matching as well
    as differing spellings of the filename prefix.
    """
    pyramid_config.testing_securitypolicy(userid=1)

    # Database fixtures: a user with an email and a role on a project that
    # already has a 1.0 release.
    uploader = UserFactory.create()
    EmailFactory.create(user=uploader)
    project = ProjectFactory.create(name=project_name)
    release = ReleaseFactory.create(project=project, version="1.0")
    RoleFactory.create(user=uploader, project=project)

    # Both the stored and the probed filename share this sdist pattern.
    sdist_template = "{}-{}.tar.gz"
    filename = sdist_template.format(project_name, release.version)

    file_value = io.BytesIO(_TAR_GZ_PKG_TESTDATA).getvalue()
    sha256_hex = hashlib.sha256(file_value).hexdigest()
    md5_hex = hashlib.md5(file_value).hexdigest()
    blake2_hex = hashlib.blake2b(file_value, digest_size=256 // 8).hexdigest()
    hashes = {"sha256": sha256_hex, "md5": md5_hex, "blake2_256": blake2_hex}

    # Persist the already-uploaded file under the project-name spelling.
    stored_path = "source/{name[0]}/{name}/{filename}".format(
        name=project.name, filename=filename
    )
    existing_file = File(
        release=release,
        filename=filename,
        md5_digest=md5_hex,
        sha256_digest=sha256_hex,
        blake2_256_digest=blake2_hex,
        path=stored_path,
    )
    db_request.db.add(existing_file)

    # Probe with the (possibly differently spelled) prefix and equal hashes.
    duplicate_filename = sdist_template.format(file_name_prefix, release.version)
    is_duplicate = legacy._is_duplicate_file(
        db_request.db, duplicate_filename, hashes
    )

    assert is_duplicate
707+
669708
def test_is_duplicate_none(self, pyramid_config, db_request):
670709
pyramid_config.testing_securitypolicy(userid=1)
671710

warehouse/forklift/legacy.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,8 @@ def _is_duplicate_file(db_session, filename, hashes):
766766

767767
if file_ is not None:
768768
return (
769-
file_.filename == filename
769+
# This has the effect of canonicalizing the project name and version
770+
_parse_filename(file_.filename) == _parse_filename(filename)
770771
and file_.sha256_digest == hashes["sha256"]
771772
and file_.md5_digest == hashes["md5"]
772773
and file_.blake2_256_digest == hashes["blake2_256"]

0 commit comments

Comments
 (0)