import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# parallel processing
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=16)
from tqdm import tqdm
tqdm.pandas()

# utility
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
seed = 42
n_train = 20000
n_valid = 1000
n_test = 10000
n_samples = n_train + n_valid + n_test  # 31000
hf_dir = "AISE-TUDelft/the-heap"

# Filter 0: drop files with fewer than 300 tokens (the filter itself is applied further below).
# Load the tokenizer from the HF hub.
checkpoint = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenize the sequences. Padding is disabled, since we don't want to sample
# placeholder tokens; no truncation is applied either.
def tokenize_input(batch):
    return tokenizer(batch['content'], padding='do_not_pad', return_tensors='pt')
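
# Note: with return_tensors='pt' and a single string, the tokenizer returns
# input_ids of shape (1, seq_len). Since dataset.map() below runs with
# batched=False, each call receives one file, and the extra batch dimension
# is stripped with input_ids[0] after converting to pandas.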

# Filters:
def longline_filter(file):
    """
    input: a file string
    output: a boolean, True if the file passes the filter, False if not
    """
    # collapse double newlines and split the file into a list of lines
    lines = file.replace('\n\n', '\n').split('\n')
    # if the number of lines reaches 100k, filter the file out
    if len(lines) >= 100000:
        return False
    # second filter (line lengths): ignore lines longer than 1000 characters,
    # then keep the file only if the average length of the remaining lines is
    # at most 100 characters
    line_lengths = [len(line) for line in lines if len(line) <= 1000]
    if line_lengths == []:
        # every line is longer than 1000 characters
        return False
    else:
        return np.mean(line_lengths) <= 100
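
# Quick sanity checks (illustrative only):
#   longline_filter("x" * 2000)                  -> False (the only line exceeds 1000 characters)
#   longline_filter("\n".join(["x" * 80] * 50))  -> True  (average line length 80 <= 100)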

def alpha_filter(file):
    """
    input: a file string
    output: the percentage of alphabetic characters
    """
    total_chars = len(file)
    if total_chars == 0:
        return 0
    alpha_chars = sum(c.isalpha() for c in file)
    return (alpha_chars / total_chars) * 100
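
# Quick sanity check (illustrative only): alpha_filter("abc123") returns 50.0.
# Unlike the other filters, this one returns a percentage rather than a boolean;
# the > 25% threshold is applied where the filters are combined further below.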

base64_regex = re.compile(r'[a-zA-Z0-9+/\n=]{64,}')
hex_regex = re.compile(r'(?:\b(?:0x|\\x)?[0-9a-fA-F]{2}(?:,|\b\s*)){8,}')
unicode_regex = re.compile(r'(?:\\u[0-9a-fA-F]{4}){8,}')

def encoded_data_filter(file):
    """
    input: a file string
    output: a boolean, True if the file passes the filter, False if not
    """
    total_length = len(file)

    # Find all matches of the regex patterns
    base64_matches = base64_regex.findall(file)
    hex_matches = hex_regex.findall(file)
    unicode_matches = unicode_regex.findall(file)

    # Concatenate all matches into one list
    all_matches = base64_matches + hex_matches + unicode_matches

    # Calculate the total length of all matched strings
    matched_length = sum(len(match) for match in all_matches)

    # Check if any match exceeds 1024 characters or if the matched fraction is more than 50%
    if any(len(match) > 1024 for match in all_matches) or (matched_length / total_length > 0.5):
        return False

    return True
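
# Quick sanity checks (illustrative only):
#   encoded_data_filter("A" * 2000)                   -> False (a base64-like run longer than 1024 characters)
#   encoded_data_filter("public class Foo {}\n" * 20) -> True  (no long encoded runs)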

# Following the TheStackV2 paper: remove files classified as auto-generated by go-enry's is_generated function
# go-enry Java regexes
auto_gen1 = re.compile(r'Generated by the protocol buffer compiler\. DO NOT EDIT!')
auto_gen2 = re.compile(r'Autogenerated by Thrift Compiler')
auto_gen3 = re.compile(r'/\* The following code was generated by JFlex ')
auto_gen4 = re.compile(r'// This is a generated file\. Not intended for manual editing.')
auto_gen5 = re.compile(r'Generated by Haxe')
auto_gen6 = re.compile(r'This file is generated by jOOQ.')
# additional regex (implemented by TheStackV2)
auto_gen7 = re.compile(r'auto-?generated|automatically\s*generated|generated\s*automatically|this\s*file\s*is\s*generated')
# pattern for repetitive lines (added by me); defined here but not applied in autogen() below
auto_gen8 = re.compile(r'(.*)\1{3,}')

def autogen(file):
    match1 = auto_gen1.findall(file)
    match2 = auto_gen2.findall(file)
    match3 = auto_gen3.findall(file)
    match4 = auto_gen4.findall(file)
    match5 = auto_gen5.findall(file)
    match6 = auto_gen6.findall(file)
    match7 = auto_gen7.findall(file)

    all_matches = match1 + match2 + match3 + match4 + match5 + match6 + match7

    # True if no auto-generation marker was found, False otherwise
    if all_matches == []:
        return True
    else:
        return False
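
# Quick sanity checks (illustrative only):
#   autogen("// Autogenerated by Thrift Compiler")  -> False (auto-generation marker found)
#   autogen("public class Foo { int x; }")          -> True  (no markers)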

# load the dataset
dataset = load_dataset(hf_dir, split='train')
# Filter out files that are near-duplicates of at least one file from TheStackV2
dataset = dataset.filter(lambda sample: len(sample["near_dups_stkv2_idx"]) == 0)
# tokenize
dataset = dataset.map(tokenize_input, batched=False, num_proc=64)

# Filter 0: remove files with fewer than 300 tokens
# convert to a pandas DataFrame
df = dataset.to_pandas()
# each input_ids entry carries an extra batch dimension (a list of lists); keep only the inner sequence
df['input_ids'] = df['input_ids'].progress_apply(lambda x: x[0])
# keep only files with more than 300 tokens
df['n_tok'] = df['input_ids'].progress_apply(len)
df = df[df['n_tok'] > 300]

# Longline filter
df_try = df.copy()
df_try['longline'] = df_try.progress_apply(lambda x: longline_filter(x['content']), axis=1)
# Alpha filter
df_try['alpha'] = df_try.progress_apply(lambda x: alpha_filter(x['content']), axis=1)
# Encoded data filter
df_try['encoded'] = df_try.progress_apply(lambda x: encoded_data_filter(x['content']), axis=1)
# Autogen filter
df_try['autogen'] = df_try.progress_apply(lambda x: autogen(x['content']), axis=1)

# Keep the files that pass all the filters
df_filtered = df_try[(df_try['longline'] == True) & (df_try['alpha'] > 25) & (df_try['encoded'] == True) & (df_try['autogen'] == True)]

# Sampling
df_sampled = df_filtered.sample(n=n_samples, replace=False, random_state=seed)
df_train = df_sampled.iloc[0:n_train]
df_valid = df_sampled.iloc[n_train:n_train+n_valid]
df_test = df_sampled.iloc[n_train+n_valid:]

print(f" train: {df_train.shape}\n valid: {df_valid.shape}\n test: {df_test.shape}")

# Saving the files
# You can download these files from this link: https://huggingface.co/datasets/AISE-TUDelft/memtune-tuning_data
df_train.to_parquet('./train_java.parquet', index=False)
df_valid.to_parquet('./valid_java.parquet', index=False)
df_test.to_parquet('./test_java.parquet', index=False)