@@ -127,6 +127,7 @@ ASTNode getAnASTNodeWithAFeature(Function f) {
127127 result = getAnASTNodeToFeaturize ( f )
128128}
129129
130+ /** Returns the total number of characters in the tokenized AST nodes of a function that have a feature. */
130131int getNumCharsInFunction ( Function f ) {
131132 result =
132133 strictsum ( ASTNode node | node = getAnASTNodeWithAFeature ( f ) | getTokenizedAstNode ( node ) .length ( ) )
@@ -135,10 +136,6 @@ int getNumCharsInFunction(Function f) {
135136// Upper bound on a function's character count for body featurization. The evaluator string limit is 5395415 characters; we choose a limit lower than this.
136137private int getMaxChars ( ) { result = 1000000 }
137138
138- Function getFeaturizableFunction ( Function f ) {
139- result = f and getNumCharsInFunction ( f ) <= getMaxChars ( )
140- }
141-
142139/**
143140 * Returns a featurized representation of the function that can be used to populate the
144141 * `enclosingFunctionBody` feature for an endpoint.
@@ -147,13 +144,15 @@ string getBodyTokensFeature(Function function) {
147144 // Performance optimization: If a function has more than 256 body subtokens, then featurize it as
148145 // absent. This approximates the behavior of the classifier on non-generic body features where
149146 // large body features are replaced by the absent token.
147+ //
148+ // We count nodes instead of tokens because tokens are often not unique.
150149 strictcount ( ASTNode node |
151150 node = getAnASTNodeToFeaturize ( function ) and
152151 exists ( getTokenizedAstNode ( node ) )
153152 ) <= 256 and
154153 // Performance optimization: If a function has more than getMaxChars() characters in its body subtokens,
155154 // then featurize it as absent.
156- function = getFeaturizableFunction ( function ) and
155+ getNumCharsInFunction ( function ) <= getMaxChars ( ) and
157156 result =
158157 strictconcat ( Location l , string token |
159158 // The use of a nested exists here allows us to avoid duplicates due to two AST nodes in the
0 commit comments