Skip to content

Commit 01961fe

Browse files
committed
Remove the unnecessary SDG class
A pipeline chains together a sequence of blocks, and an SDG chains together a sequence of pipelines. This additional layer is unnecessary: we can construct a single pipeline containing the full sequence of blocks instead of chaining pipelines together.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent ca30d98 commit 01961fe

File tree

6 files changed

+13
-41
lines changed

6 files changed

+13
-41
lines changed

scripts/test_freeform_skills.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from openai import OpenAI
77

88
# First Party
9-
from src.instructlab.sdg import SDG
109
from src.instructlab.sdg.pipeline import (
1110
FULL_PIPELINES_PACKAGE,
1211
Pipeline,
@@ -60,8 +59,7 @@
6059
with resources.path(FULL_PIPELINES_PACKAGE, "freeform_skills.yaml") as yaml_path:
6160
skills_pipe = Pipeline.from_file(ctx, yaml_path)
6261

63-
sdg = SDG([skills_pipe])
64-
gen_data = sdg.generate(ds)
62+
gen_data = skills_pipe.generate(ds)
6563

6664
print(gen_data)
6765
print(gen_data[0])

scripts/test_grounded_skills.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from openai import OpenAI
77

88
# First Party
9-
from src.instructlab.sdg import SDG
109
from src.instructlab.sdg.pipeline import (
1110
FULL_PIPELINES_PACKAGE,
1211
Pipeline,
@@ -108,8 +107,7 @@
108107
with resources.path(FULL_PIPELINES_PACKAGE, "grounded_skills.yaml") as yaml_path:
109108
skills_pipe = Pipeline.from_file(ctx, yaml_path)
110109

111-
sdg = SDG([skills_pipe])
112-
gen_data = sdg.generate(ds)
110+
gen_data = skills_pipe.generate(ds)
113111

114112
print(gen_data)
115113
print(gen_data[0])

scripts/test_knowledge.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from openai import OpenAI
88

99
# First Party
10-
from src.instructlab.sdg import SDG
1110
from src.instructlab.sdg.pipeline import (
1211
FULL_PIPELINES_PACKAGE,
1312
Pipeline,
@@ -47,8 +46,7 @@
4746
with resources.path(FULL_PIPELINES_PACKAGE, "knowledge.yaml") as yaml_path:
4847
knowledge_pipe = Pipeline.from_file(ctx, yaml_path)
4948

50-
sdg = SDG([knowledge_pipe])
51-
mmlubench_data = sdg.generate(ds)
49+
mmlubench_data = knowledge_pipe.generate(ds)
5250

5351
print(mmlubench_data)
5452
print(mmlubench_data[0])

src/instructlab/sdg/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
"SamplePopulatorBlock",
2222
"SelectorBlock",
2323
"SetToMajorityValueBlock",
24-
"SDG",
2524
"SIMPLE_PIPELINES_PACKAGE",
2625
"FULL_PIPELINES_PACKAGE",
2726
"generate_data",
@@ -42,7 +41,6 @@
4241
PipelineConfigParserError,
4342
PipelineContext,
4443
)
45-
from .sdg import SDG
4644
from .utilblocks import (
4745
CombineColumnsBlock,
4846
DuplicateColumnsBlock,

src/instructlab/sdg/generate_data.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
Pipeline,
3131
PipelineContext,
3232
)
33-
from instructlab.sdg.sdg import SDG
3433
from instructlab.sdg.utils import GenerateException, models
3534
from instructlab.sdg.utils.taxonomy import (
3635
leaf_node_to_samples,
@@ -241,9 +240,9 @@ def load_pipeline(yaml_basename):
241240
return Pipeline.from_file(ctx, os.path.join(pipeline, yaml_basename))
242241

243242
return (
244-
SDG([load_pipeline("knowledge.yaml")]),
245-
SDG([load_pipeline("freeform_skills.yaml")]),
246-
SDG([load_pipeline("grounded_skills.yaml")]),
243+
load_pipeline("knowledge.yaml"),
244+
load_pipeline("freeform_skills.yaml"),
245+
load_pipeline("grounded_skills.yaml"),
247246
)
248247

249248

@@ -361,7 +360,9 @@ def generate_data(
361360
batch_num_workers=num_cpus,
362361
)
363362

364-
sdg_knowledge, sdg_freeform_skill, sdg_grounded_skill = _sdg_init(ctx, pipeline)
363+
knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
364+
ctx, pipeline
365+
)
365366

366367
# Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
367368
mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None)
@@ -384,19 +385,19 @@ def generate_data(
384385
raise GenerateException("Error: No samples found in leaf node.")
385386

386387
if samples[0].get("document"):
387-
sdg = sdg_knowledge
388+
pipe = knowledge_pipe
388389
is_knowledge = True
389390

390391
elif samples[0].get("seed_context"):
391-
sdg = sdg_grounded_skill
392+
pipe = grounded_skills_pipe
392393

393394
else:
394-
sdg = sdg_freeform_skill
395+
pipe = freeform_skills_pipe
395396

396397
logger.debug("Samples: %s", samples)
397398
ds = Dataset.from_list(samples)
398399
logger.debug("Dataset: %s", ds)
399-
new_generated_data = sdg.generate(ds)
400+
new_generated_data = pipe.generate(ds)
400401
if len(new_generated_data) == 0:
401402
raise EmptyDatasetError(
402403
"Pipeline stopped: Empty dataset after running pipe"

src/instructlab/sdg/sdg.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

0 commit comments

Comments
 (0)