1+ """
2+ This script is used to fine-tune StarCoder2 family models on a java dataset, for code completion task.
3+ """
import torch
from datasets import load_dataset, disable_caching
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, set_seed

# parallel processing
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=16)
from tqdm import tqdm
tqdm.pandas()

# utility
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os

21+ """
22+ Setting the variables.
23+ """
24+
25+ disable_caching ()
26+
27+ set_seed (42 )
28+
29+ wproject = "name" # wb project name
30+ run_name = "run-name" # name of the W&B run (optional)
31+ # training batches
32+ batch = 5
33+ # Load base-model and tokenizer from HF-hub
34+ checkpoint = "bigcode/starcoder2-15b"
35+ # Select the column of interest from the dataset
36+ text_column = 'content'
37+
38+ # training
39+ max_length = 1024
40+ # model parallel
41+ device_map = 'auto'
42+
43+ #wandb setup
44+ import wandb
45+ wandb .login ()
46+ os .environ ["WANDB_PROJECT" ] = wproject # wandb project name
47+
48+ """
49+ Loading the model and tokenizer
50+ """
51+ # tokenizer
52+ tokenizer = AutoTokenizer .from_pretrained (checkpoint )
53+ tokenizer .pad_token = tokenizer .eos_token # setting the pad token to the end of sequence token
54+
55+ # model
56+ model = AutoModelForCausalLM .from_pretrained (
57+ checkpoint ,
58+ device_map = device_map )
59+

"""
Loading and preprocessing the data
"""
# LINK FOR THE DATASET: https://huggingface.co/datasets/AISE-TUDelft/memtune-tuning_data
# Load the data
dataset_train_20 = load_dataset("AISE-TUDelft/memtune-tuning_data", name="20k", split='train')
dataset_valid_20 = load_dataset("AISE-TUDelft/memtune-tuning_data", name="20k", split='valid')

# Pick the column of interest
train_20 = dataset_train_20.select_columns(text_column)
validation_20 = dataset_valid_20.select_columns(text_column)

# Tokenize the sequences
# Note: StarCoder2 has a context length of 16,384 tokens; sequences are truncated to max_length here.
def tokenize_input(batch):
    return tokenizer(batch[text_column], padding="max_length", truncation=True, max_length=max_length, return_tensors='pt')

training_20 = train_20.map(tokenize_input, batched=True, num_proc=64, remove_columns=text_column)
validating_20 = validation_20.map(tokenize_input, batched=True, num_proc=64, remove_columns=text_column)

81+ """
82+ Training initialization
83+ """
84+ # Data collator
85+ data_collator = DataCollatorForLanguageModeling (
86+ tokenizer = tokenizer ,
87+ mlm = False ,
88+ return_tensors = 'pt'
89+ )
90+
91+
92+ # Args
93+ output_dir = "./epochs"
94+ overwrite_output_dir = False
95+
96+ per_device_train_batch_size = batch
97+ per_device_eval_batch_size = batch
98+ gradient_accumulation_steps = 5
99+
100+ optim = "adafactor"
101+ adam_beta1 = 0.9
102+ weight_decay = 0.1
103+
104+ learning_rate = 3e-5
105+ lr_scheduler_type = "linear"
106+ warmup_steps = 50
107+
108+ num_train_epochs = 3
109+ eval_steps = 0.08 #200 # each epoch two evaluations
110+ eval_strategy = "steps" # default is "no"
111+ save_strategy = "epoch" # default is "steps"
112+
113+ logging_steps = 1
114+ report_to = "wandb"
115+
# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    save_strategy=save_strategy,
    eval_strategy=eval_strategy,

    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,

    per_device_eval_batch_size=per_device_eval_batch_size,
    eval_steps=eval_steps,

    optim=optim,
    adam_beta1=adam_beta1,
    weight_decay=weight_decay,

    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=warmup_steps,

    logging_steps=logging_steps,
    report_to=report_to,
    run_name=run_name,
    seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=training_20,
    eval_dataset=validating_20
)

# Training
trainer.train()
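
"""
Saving the final model (optional sketch)
"""
# Optional addition, not part of the original script: persist the fine-tuned weights and
# tokenizer so they can be reloaded later with AutoModelForCausalLM.from_pretrained /
# AutoTokenizer.from_pretrained. Reusing output_dir as the save path is an assumption;
# any directory works.
trainer.save_model(output_dir)         # writes model weights and config
tokenizer.save_pretrained(output_dir)  # writes tokenizer files alongside the model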