Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 53 additions & 11 deletions crates/chat-cli/src/auth/builder_id.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,10 @@ impl BuilderIdToken {
}

/// Load the token from the keychain, refresh the token if it is expired and return it
pub async fn load(database: &Database) -> Result<Option<Self>, AuthError> {
pub async fn load(
database: &Database,
telemetry: Option<&crate::telemetry::TelemetryThread>,
) -> Result<Option<Self>, AuthError> {
// Can't use #[cfg(test)] without breaking lints, and we don't want to require
// authentication in order to run ChatSession tests. Hence, adding this here with cfg!(test)
if cfg!(test) {
Expand All @@ -328,7 +331,7 @@ impl BuilderIdToken {

if token.is_expired() {
trace!("token is expired, refreshing");
token.refresh_token(&client, database, &region).await
token.refresh_token(&client, database, &region, telemetry).await
} else {
trace!(?token, "found a valid token");
Ok(Some(token))
Expand Down Expand Up @@ -357,6 +360,7 @@ impl BuilderIdToken {
client: &Client,
database: &Database,
region: &Region,
telemetry: Option<&crate::telemetry::TelemetryThread>,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to be an Option type? Do we not want to always emit something when token refresh fails?

Copy link
Contributor Author

@abhraina-aws abhraina-aws Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its because of the load function. There are cases when we can pass None to it for telemetry because its not a login failure case. And hence that gets passed down. So we just implemented it to be able to handle a None for telemetry.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh alright. I'll let the second PR chime in here. Not familiar enough with the auth flow to have more insights here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is more like a code arrangement thing now. Either I handle the None in one place vs the other.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

imo, instead of making this an Option, you can just check if there's an error and only then publish the auth-failed?

Also, do you wanna publish an auth-succeeded metric?

Copy link
Contributor Author

@abhraina-aws abhraina-aws Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the issue here is that load is being called in a-lot of places which may not necessarily need telemetry publish. So instead of adding the code everywhere to have telemetry to pass, I am allowing a None. We are already only posting auth_failed only when we have an error.

We already have a login successful metric. So that covers it.

) -> Result<Option<Self>, AuthError> {
let Some(refresh_token) = &self.refresh_token else {
warn!("no refresh token was found");
Expand Down Expand Up @@ -416,6 +420,25 @@ impl BuilderIdToken {
let display_err = DisplayErrorContext(&err);
error!("Failed to refresh builder id access token: {}", display_err);

// Send telemetry for refresh failure
if let Some(telemetry) = telemetry {
let auth_method = match self.token_type() {
TokenType::BuilderId => "BuilderId",
TokenType::IamIdentityCenter => "IdentityCenter",
};
let oauth_flow = match self.oauth_flow {
OAuthFlow::DeviceCode => "DeviceCode",
OAuthFlow::Pkce => "PKCE",
};
let error_code = match &err {
SdkError::ServiceError(service_err) => service_err.err().meta().code().map(|s| s.to_string()),
_ => None,
};
telemetry
.send_auth_failed(auth_method, oauth_flow, "TokenRefresh", error_code)
.ok();
}

// if the error is the client's fault, clear the token
if let SdkError::ServiceError(service_err) = &err {
if !service_err.err().is_slow_down_exception() {
Expand Down Expand Up @@ -471,11 +494,7 @@ impl BuilderIdToken {
}

pub fn token_type(&self) -> TokenType {
match &self.start_url {
Some(url) if url == START_URL => TokenType::BuilderId,
None => TokenType::BuilderId,
Some(_) => TokenType::IamIdentityCenter,
}
token_type_from_start_url(self.start_url.as_deref())
}

/// Check if the token is for the internal amzn start URL (`https://amzn.awsapps.com/start`),
Expand All @@ -486,6 +505,14 @@ impl BuilderIdToken {
}
}

pub fn token_type_from_start_url(start_url: Option<&str>) -> TokenType {
Copy link
Contributor

@dingfeli dingfeli Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe just implement an From<Option<&str>> for TokenType here. We own the definition of TokenType in this crate so you don't have to worry about orphan rule.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. Will take care of this.

match start_url {
Some(url) if url == START_URL => TokenType::BuilderId,
None => TokenType::BuilderId,
Some(_) => TokenType::IamIdentityCenter,
}
}

pub enum PollCreateToken {
Pending,
Complete,
Expand All @@ -498,6 +525,7 @@ pub async fn poll_create_token(
device_code: String,
start_url: Option<String>,
region: Option<String>,
telemetry: &crate::telemetry::TelemetryThread,
) -> PollCreateToken {
let region = region.clone().map_or(OIDC_BUILDER_ID_REGION, Region::new);
let client = client(region.clone());
Expand Down Expand Up @@ -538,6 +566,20 @@ pub async fn poll_create_token(
},
Err(err) => {
error!(?err, "Failed to poll for builder id token");

// Send telemetry for device code failure
let auth_method = match token_type_from_start_url(start_url.as_deref()) {
TokenType::BuilderId => "BuilderId",
TokenType::IamIdentityCenter => "IdentityCenter",
};
let error_code = match &err {
SdkError::ServiceError(service_err) => service_err.err().meta().code().map(|s| s.to_string()),
_ => None,
};
telemetry
.send_auth_failed(auth_method, "DeviceCode", "NewLogin", error_code)
.ok();

PollCreateToken::Error(err.into())
},
}
Expand All @@ -550,7 +592,7 @@ pub async fn is_logged_in(database: &mut Database) -> bool {
return true;
}

match BuilderIdToken::load(database).await {
match BuilderIdToken::load(database, None).await {
Ok(Some(_)) => true,
Ok(None) => {
info!("not logged in - no valid token found");
Expand Down Expand Up @@ -585,7 +627,7 @@ pub async fn logout(database: &mut Database) -> Result<(), AuthError> {
pub async fn get_start_url_and_region(database: &Database) -> (Option<String>, Option<String>) {
// NOTE: Database provides direct methods to access the start_url and region, but they are not
// guaranteed to be up to date in the chat session. Example: login is changed mid-chat session.
let token = BuilderIdToken::load(database).await;
let token = BuilderIdToken::load(database, None).await;
match token {
Ok(Some(t)) => (t.start_url, t.region),
_ => (None, None),
Expand All @@ -603,7 +645,7 @@ impl ResolveIdentity for BearerResolver {
) -> IdentityFuture<'a> {
IdentityFuture::new_boxed(Box::pin(async {
let database = Database::new().await?;
match BuilderIdToken::load(&database).await? {
match BuilderIdToken::load(&database, None).await? {
Some(token) => Ok(Identity::new(
Token::new(token.access_token.0.clone(), Some(token.expires_at.into())),
Some(token.expires_at.into()),
Expand All @@ -618,7 +660,7 @@ pub async fn is_idc_user(database: &Database) -> Result<bool> {
if cfg!(test) {
return Ok(false);
}
if let Ok(Some(token)) = BuilderIdToken::load(database).await {
if let Ok(Some(token)) = BuilderIdToken::load(database, None).await {
Ok(token.token_type() == TokenType::IamIdentityCenter)
} else {
Err(eyre!("No auth token found - is the user signed in?"))
Expand Down
16 changes: 13 additions & 3 deletions crates/chat-cli/src/cli/user.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@ impl LoginArgs {
]);
let ctrl_c_stream = ctrl_c();
tokio::select! {
res = registration.finish(&client, Some(&mut os.database)) => res?,
res = registration.finish(&client, Some(&mut os.database)) => {
if let Err(err) = res {
let auth_method = match crate::auth::builder_id::token_type_from_start_url(start_url.as_deref()) {
crate::auth::builder_id::TokenType::BuilderId => "BuilderId",
crate::auth::builder_id::TokenType::IamIdentityCenter => "IdentityCenter",
};
os.telemetry.send_auth_failed(auth_method, "PKCE", "NewLogin", None).ok();
return Err(err.into());
}
},
Ok(_) = ctrl_c_stream => {
#[allow(clippy::exit)]
exit(1);
Expand Down Expand Up @@ -194,7 +203,7 @@ pub struct WhoamiArgs {

impl WhoamiArgs {
pub async fn execute(self, os: &mut Os) -> Result<ExitCode> {
let builder_id = BuilderIdToken::load(&os.database).await;
let builder_id = BuilderIdToken::load(&os.database, Some(&os.telemetry)).await;

match builder_id {
Ok(Some(token)) => {
Expand Down Expand Up @@ -245,7 +254,7 @@ pub enum LicenseType {
}

pub async fn profile(os: &mut Os) -> Result<ExitCode> {
if let Ok(Some(token)) = BuilderIdToken::load(&os.database).await {
if let Ok(Some(token)) = BuilderIdToken::load(&os.database, Some(&os.telemetry)).await {
if matches!(token.token_type(), TokenType::BuilderId) {
bail!("This command is only available for Pro users");
}
Expand Down Expand Up @@ -314,6 +323,7 @@ async fn try_device_authorization(os: &mut Os, start_url: Option<String>, region
device_auth.device_code.clone(),
start_url.clone(),
region.clone(),
&os.telemetry,
)
.await
{
Expand Down
25 changes: 25 additions & 0 deletions crates/chat-cli/src/telemetry/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use crate::telemetry::definitions::metrics::{
CodewhispererterminalAddChatMessage,
CodewhispererterminalAgentConfigInit,
CodewhispererterminalAgentContribution,
CodewhispererterminalAuthFailed,
CodewhispererterminalChatSlashCommandExecuted,
CodewhispererterminalCliSubcommandExecuted,
CodewhispererterminalMcpServerInit,
Expand Down Expand Up @@ -500,6 +501,24 @@ impl Event {
}
.into_metric_datum(),
),
EventType::AuthFailed {
auth_method,
oauth_flow,
error_type,
error_code,
} => Some(
CodewhispererterminalAuthFailed {
create_time: self.created_time,
value: None,
credential_start_url: self.credential_start_url.map(Into::into),
codewhispererterminal_in_cloudshell: None,
codewhispererterminal_auth_method: Some(auth_method.into()),
oauth_flow: Some(oauth_flow.into()),
codewhispererterminal_error_type: Some(error_type.into()),
codewhispererterminal_error_code: error_code.map(Into::into),
}
.into_metric_datum(),
),
EventType::DailyHeartbeat {} => Some(
AmazonqcliDailyHeartbeat {
create_time: self.created_time,
Expand Down Expand Up @@ -594,6 +613,12 @@ pub struct AgentConfigInitArgs {
#[serde(tag = "type")]
pub enum EventType {
UserLoggedIn {},
AuthFailed {
auth_method: String,
oauth_flow: String,
error_type: String,
error_code: Option<String>,
},
RefreshCredentials {
request_id: String,
result: TelemetryResult,
Expand Down
15 changes: 15 additions & 0 deletions crates/chat-cli/src/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,21 @@ impl TelemetryThread {
Ok(self.tx.send(Event::new(EventType::UserLoggedIn {}))?)
}

pub fn send_auth_failed(
&self,
auth_method: &str,
oauth_flow: &str,
error_type: &str,
error_code: Option<String>,
) -> Result<(), TelemetryError> {
Ok(self.tx.send(Event::new(EventType::AuthFailed {
auth_method: auth_method.to_string(),
oauth_flow: oauth_flow.to_string(),
error_type: error_type.to_string(),
error_code,
}))?)
}

pub fn send_daily_heartbeat(&self) -> Result<(), TelemetryError> {
Ok(self.tx.send(Event::new(EventType::DailyHeartbeat {}))?)
}
Expand Down
28 changes: 28 additions & 0 deletions crates/chat-cli/telemetry_definitions.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,21 @@
"type": "string",
"description": "The oauth authentication flow executed by the user, e.g. device code or PKCE"
},
{
"name": "codewhispererterminal_authMethod",
"type": "string",
"description": "The authentication method used, e.g. BuilderId or IdentityCenter"
},
{
"name": "codewhispererterminal_errorType",
"type": "string",
"description": "The type of authentication error, e.g. TokenRefresh or NewLogin"
},
{
"name": "codewhispererterminal_errorCode",
"type": "string",
"description": "The specific error code from the authentication service"
},
{
"name": "result",
"type": "string",
Expand Down Expand Up @@ -369,6 +384,19 @@
{ "type": "codewhispererterminal_inCloudshell" }
]
},
{
"name": "codewhispererterminal_authFailed",
"description": "Emitted when authentication fails",
"passive": false,
"metadata": [
{ "type": "credentialStartUrl" },
{ "type": "codewhispererterminal_inCloudshell" },
{ "type": "codewhispererterminal_authMethod" },
{ "type": "oauthFlow" },
{ "type": "codewhispererterminal_errorType" },
{ "type": "codewhispererterminal_errorCode", "required": false }
]
},
{
"name": "codewhispererterminal_refreshCredentials",
"description": "Emitted when users refresh their credentials",
Expand Down