Skip to content

Commit 348852c

Browse files
CM-55207: Fix Secrets documents collecting for diff scan (#368)
1 parent 700654f commit 348852c

File tree

3 files changed

+109
-19
lines changed

3 files changed

+109
-19
lines changed

cycode/cli/apps/scan/commit_range_scanner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def _scan_commit_range_documents(
186186
def _scan_sca_commit_range(ctx: typer.Context, repo_path: str, commit_range: str, **_) -> None:
187187
scan_parameters = get_scan_parameters(ctx, (repo_path,))
188188

189-
from_commit_rev, to_commit_rev = parse_commit_range(commit_range, repo_path)
189+
from_commit_rev, to_commit_rev, _ = parse_commit_range(commit_range, repo_path)
190190
from_commit_documents, to_commit_documents, _ = get_commit_range_modified_documents(
191191
ctx.obj['progress_bar'], ScanProgressBarSection.PREPARE_LOCAL_FILES, repo_path, from_commit_rev, to_commit_rev
192192
)
@@ -227,7 +227,7 @@ def _scan_secret_commit_range(
227227
def _scan_sast_commit_range(ctx: typer.Context, repo_path: str, commit_range: str, **_) -> None:
228228
scan_parameters = get_scan_parameters(ctx, (repo_path,))
229229

230-
from_commit_rev, to_commit_rev = parse_commit_range(commit_range, repo_path)
230+
from_commit_rev, to_commit_rev, _ = parse_commit_range(commit_range, repo_path)
231231
_, commit_documents, diff_documents = get_commit_range_modified_documents(
232232
ctx.obj['progress_bar'],
233233
ScanProgressBarSection.PREPARE_LOCAL_FILES,

cycode/cli/files_collector/commit_range_documents.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,17 @@ def collect_commit_range_diff_documents(
6767
commit_documents_to_scan = []
6868

6969
repo = git_proxy.get_repo(path)
70-
total_commits_count = int(repo.git.rev_list('--count', commit_range))
71-
logger.debug('Calculating diffs for %s commits in the commit range %s', total_commits_count, commit_range)
70+
71+
normalized_commit_range = normalize_commit_range(commit_range, path)
72+
73+
total_commits_count = int(repo.git.rev_list('--count', normalized_commit_range))
74+
logger.debug(
75+
'Calculating diffs for %s commits in the commit range %s', total_commits_count, normalized_commit_range
76+
)
7277

7378
progress_bar.set_section_length(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count)
7479

75-
for scanned_commits_count, commit in enumerate(repo.iter_commits(rev=commit_range)):
80+
for scanned_commits_count, commit in enumerate(repo.iter_commits(rev=normalized_commit_range)):
7681
if _does_reach_to_max_commits_to_scan_limit(commit_ids_to_scan, max_commits_count):
7782
logger.debug('Reached to max commits to scan count. Going to scan only %s last commits', max_commits_count)
7883
progress_bar.update(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count - scanned_commits_count)
@@ -96,7 +101,12 @@ def collect_commit_range_diff_documents(
96101

97102
logger.debug(
98103
'Found all relevant files in commit %s',
99-
{'path': path, 'commit_range': commit_range, 'commit_id': commit_id},
104+
{
105+
'path': path,
106+
'commit_range': commit_range,
107+
'normalized_commit_range': normalized_commit_range,
108+
'commit_id': commit_id,
109+
},
100110
)
101111

102112
logger.debug('List of commit ids to scan, %s', {'commit_ids': commit_ids_to_scan})
@@ -428,8 +438,9 @@ def get_pre_commit_modified_documents(
428438
return git_head_documents, pre_committed_documents, diff_documents
429439

430440

431-
def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Optional[str]]:
441+
def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
432442
"""Parses a git commit range string and returns the full SHAs for the 'from' and 'to' commits.
443+
Also, it returns the separator in the commit range.
433444
434445
Supports:
435446
- 'from..to'
@@ -440,8 +451,10 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt
440451
"""
441452
repo = git_proxy.get_repo(path)
442453

454+
separator = '..'
443455
if '...' in commit_range:
444456
from_spec, to_spec = commit_range.split('...', 1)
457+
separator = '...'
445458
elif '..' in commit_range:
446459
from_spec, to_spec = commit_range.split('..', 1)
447460
else:
@@ -459,7 +472,28 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt
459472
# Use rev_parse to resolve each specifier to its full commit SHA
460473
from_commit_rev = repo.rev_parse(from_spec).hexsha
461474
to_commit_rev = repo.rev_parse(to_spec).hexsha
462-
return from_commit_rev, to_commit_rev
475+
return from_commit_rev, to_commit_rev, separator
463476
except git_proxy.get_git_command_error() as e:
464477
logger.warning("Failed to parse commit range '%s'", commit_range, exc_info=e)
465-
return None, None
478+
return None, None, None
479+
480+
481+
def normalize_commit_range(commit_range: str, path: str) -> str:
482+
"""Normalize a commit range string to handle various formats consistently with all scan types.
483+
484+
Returns:
485+
A normalized commit range string suitable for Git operations (e.g., 'full_sha1..full_sha2')
486+
"""
487+
from_commit_rev, to_commit_rev, separator = parse_commit_range(commit_range, path)
488+
if from_commit_rev is None or to_commit_rev is None:
489+
logger.warning('Failed to parse commit range "%s", falling back to raw string.', commit_range)
490+
return commit_range
491+
492+
# Construct a normalized range string using the original separator for iter_commits
493+
normalized_commit_range = f'{from_commit_rev}{separator}{to_commit_rev}'
494+
logger.debug(
495+
'Normalized commit range "%s" to "%s"',
496+
commit_range,
497+
normalized_commit_range,
498+
)
499+
return normalized_commit_range

tests/cli/files_collector/test_commit_range_documents.py

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
_get_default_branches_for_merge_base,
1414
calculate_pre_push_commit_range,
1515
calculate_pre_receive_commit_range,
16+
collect_commit_range_diff_documents,
1617
get_diff_file_path,
1718
get_safe_head_reference_for_diff,
1819
parse_commit_range,
@@ -846,40 +847,40 @@ def test_two_dot_linear_history(self) -> None:
846847
with temporary_git_repository() as (temp_dir, repo):
847848
a, b, c = self._make_linear_history(repo, temp_dir)
848849

849-
parsed_from, parsed_to = parse_commit_range(f'{a}..{c}', temp_dir)
850-
assert (parsed_from, parsed_to) == (a, c)
850+
parsed_from, parsed_to, separator = parse_commit_range(f'{a}..{c}', temp_dir)
851+
assert (parsed_from, parsed_to, separator) == (a, c, '..')
851852

852853
def test_three_dot_linear_history(self) -> None:
853854
"""For 'A...C' in linear history, expect (A,C)."""
854855
with temporary_git_repository() as (temp_dir, repo):
855856
a, b, c = self._make_linear_history(repo, temp_dir)
856857

857-
parsed_from, parsed_to = parse_commit_range(f'{a}...{c}', temp_dir)
858-
assert (parsed_from, parsed_to) == (a, c)
858+
parsed_from, parsed_to, separator = parse_commit_range(f'{a}...{c}', temp_dir)
859+
assert (parsed_from, parsed_to, separator) == (a, c, '...')
859860

860861
def test_open_right_linear_history(self) -> None:
861862
"""For 'A..', expect (A,HEAD=C)."""
862863
with temporary_git_repository() as (temp_dir, repo):
863864
a, b, c = self._make_linear_history(repo, temp_dir)
864865

865-
parsed_from, parsed_to = parse_commit_range(f'{a}..', temp_dir)
866-
assert (parsed_from, parsed_to) == (a, c)
866+
parsed_from, parsed_to, separator = parse_commit_range(f'{a}..', temp_dir)
867+
assert (parsed_from, parsed_to, separator) == (a, c, '..')
867868

868869
def test_open_left_linear_history(self) -> None:
869870
"""For '..C' where HEAD==C, expect (HEAD=C,C)."""
870871
with temporary_git_repository() as (temp_dir, repo):
871872
a, b, c = self._make_linear_history(repo, temp_dir)
872873

873-
parsed_from, parsed_to = parse_commit_range(f'..{c}', temp_dir)
874-
assert (parsed_from, parsed_to) == (c, c)
874+
parsed_from, parsed_to, separator = parse_commit_range(f'..{c}', temp_dir)
875+
assert (parsed_from, parsed_to, separator) == (c, c, '..')
875876

876877
def test_single_commit_spec(self) -> None:
877878
"""For 'A', expect (A,HEAD=C)."""
878879
with temporary_git_repository() as (temp_dir, repo):
879880
a, b, c = self._make_linear_history(repo, temp_dir)
880881

881-
parsed_from, parsed_to = parse_commit_range(a, temp_dir)
882-
assert (parsed_from, parsed_to) == (a, c)
882+
parsed_from, parsed_to, separator = parse_commit_range(a, temp_dir)
883+
assert (parsed_from, parsed_to, separator) == (a, c, '..')
883884

884885

885886
class TestParsePreReceiveInput:
@@ -1047,3 +1048,58 @@ def test_initial_oldest_commit_without_parent_with_two_commits_returns_single_co
10471048
work_repo.close()
10481049
finally:
10491050
server_repo.close()
1051+
1052+
1053+
class TestCollectCommitRangeDiffDocuments:
1054+
"""Test the collect_commit_range_diff_documents function with various commit range formats."""
1055+
1056+
def test_collect_with_various_commit_range_formats(self) -> None:
1057+
"""Test that different commit range formats are normalized and work correctly."""
1058+
with temporary_git_repository() as (temp_dir, repo):
1059+
# Create three commits
1060+
a_file = os.path.join(temp_dir, 'a.txt')
1061+
with open(a_file, 'w') as f:
1062+
f.write('A')
1063+
repo.index.add(['a.txt'])
1064+
a_commit = repo.index.commit('A')
1065+
1066+
with open(a_file, 'a') as f:
1067+
f.write('B')
1068+
repo.index.add(['a.txt'])
1069+
b_commit = repo.index.commit('B')
1070+
1071+
with open(a_file, 'a') as f:
1072+
f.write('C')
1073+
repo.index.add(['a.txt'])
1074+
c_commit = repo.index.commit('C')
1075+
1076+
# Create mock context
1077+
mock_ctx = Mock()
1078+
mock_progress_bar = Mock()
1079+
mock_progress_bar.set_section_length = Mock()
1080+
mock_progress_bar.update = Mock()
1081+
mock_ctx.obj = {'progress_bar': mock_progress_bar}
1082+
1083+
# Test two-dot range - should collect documents from commits B and C (2 commits, 2 documents)
1084+
commit_range = f'{a_commit.hexsha}..{c_commit.hexsha}'
1085+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1086+
assert len(documents) == 2, f'Expected 2 documents from range A..C, got {len(documents)}'
1087+
commit_ids_in_documents = {doc.unique_id for doc in documents if doc.unique_id}
1088+
assert b_commit.hexsha in commit_ids_in_documents
1089+
assert c_commit.hexsha in commit_ids_in_documents
1090+
1091+
# Test three-dot range - should collect documents from commits B and C (2 commits, 2 documents)
1092+
commit_range = f'{a_commit.hexsha}...{c_commit.hexsha}'
1093+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1094+
assert len(documents) == 2, f'Expected 2 documents from range A...C, got {len(documents)}'
1095+
1096+
# Test parent notation with three-dot - should collect document from commit C (1 commit, 1 document)
1097+
commit_range = f'{c_commit.hexsha}~1...{c_commit.hexsha}'
1098+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1099+
assert len(documents) == 1, f'Expected 1 document from range C~1...C, got {len(documents)}'
1100+
assert documents[0].unique_id == c_commit.hexsha
1101+
1102+
# Test single commit spec - should be interpreted as A..HEAD (commits B and C, 2 documents)
1103+
commit_range = a_commit.hexsha
1104+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1105+
assert len(documents) == 2, f'Expected 2 documents from single commit A, got {len(documents)}'

0 commit comments

Comments
 (0)