diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0f2eadf28..b9af7fda3 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,5 @@ ### REPOSITORY -/.github/CODEOWNERS @sdushantha +/.github/CODEOWNERS @sdushantha @ppfeister /.github/FUNDING.yml @sdushantha /LICENSE @sdushantha diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index e366f29d7..5029b8704 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -11,6 +11,7 @@ on: - '**/*.py' - '**/*.ini' - '**/*.toml' + - 'Dockerfile' push: branches: - master @@ -21,11 +22,13 @@ on: - '**/*.py' - '**/*.ini' - '**/*.toml' + - 'Dockerfile' jobs: tox-lint: - # Linting is ran through tox to ensure that the same linter is used by local runners runs-on: ubuntu-latest + # Linting is run through tox to ensure that the same linter + # is used by local runners steps: - uses: actions/checkout@v4 - name: Set up linting environment @@ -41,7 +44,8 @@ jobs: tox-matrix: runs-on: ${{ matrix.os }} strategy: - fail-fast: false # We want to know what specicic versions it fails on + # We want to know what specific versions it fails on + fail-fast: false matrix: os: [ ubuntu-latest, @@ -67,3 +71,22 @@ jobs: pip install tox-gh-actions - name: Run tox run: tox + docker-build-test: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Get version from pyproject.toml + id: get-version + run: | + VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2) + echo "version=$VERSION" >> $GITHUB_OUTPUT + - name: Build Docker image + run: | + docker build \ + --build-arg VERSION_TAG=${{ steps.get-version.outputs.version }} \ + -t sherlock-test:latest .
+ - name: Test Docker image runs + run: docker run --rm sherlock-test:latest --version diff --git a/Dockerfile b/Dockerfile index 361530abc..ccdfbf230 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ # 3. Build image with BOTH latest and version tags # i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .` -FROM python:3.12-slim-bullseye as build +FROM python:3.12-slim-bullseye AS build WORKDIR /sherlock RUN pip3 install --no-cache-dir --upgrade pip diff --git a/pyproject.toml b/pyproject.toml index 1d66dac68..45dc683d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,14 +46,10 @@ PySocks = "^1.7.0" requests = "^2.22.0" requests-futures = "^1.0.0" stem = "^1.8.0" -torrequest = "^0.1.0" pandas = "^2.2.1" openpyxl = "^3.0.10" tomli = "^2.2.1" -[tool.poetry.extras] -tor = ["torrequest"] - [tool.poetry.group.dev.dependencies] jsonschema = "^4.0.0" rstr = "^3.2.2" diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 4c84ac524..1f0e34225 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -79,13 +79,13 @@ "username_claimed": "pink" }, "AllMyLinks": { - "errorMsg": "Not Found", - "errorType": "message", - "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$", - "url": "https://allmylinks.com/{}", - "urlMain": "https://allmylinks.com/", - "username_claimed": "blue" - }, + "errorMsg": "Page not found", + "errorType": "message", + "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$", + "url": "https://allmylinks.com/{}", + "urlMain": "https://allmylinks.com/", + "username_claimed": "blue" +}, "AniWorld": { "errorMsg": "Dieses Profil ist nicht verf\u00fcgbar", "errorType": "message", @@ -115,7 +115,7 @@ "username_claimed": "lio24d" }, "Apple Discussions": { - "errorMsg": "The page you tried was not found. 
You may have used an outdated link or may have typed the address (URL) incorrectly.", + "errorMsg": "Looking for something in Apple Support Communities?", "errorType": "message", "url": "https://discussions.apple.com/profile/{}", "urlMain": "https://discussions.apple.com", @@ -572,8 +572,7 @@ "username_claimed": "brown" }, "CyberDefenders": { - "errorMsg": "Blue Team Training for SOC analysts and DFIR - CyberDefenders", - "errorType": "message", + "errorType": "status_code", "regexCheck": "^[^\\/:*?\"<>|@]{3,50}$", "request_method": "GET", "url": "https://cyberdefenders.org/p/{}", @@ -600,6 +599,12 @@ "urlMain": "https://www.dailymotion.com/", "username_claimed": "blue" }, + "dcinside": { + "errorType": "status_code", + "url": "https://gallog.dcinside.com/{}", + "urlMain": "https://www.dcinside.com/", + "username_claimed": "anrbrb" + }, "Dealabs": { "errorMsg": "La page que vous essayez", "errorType": "message", @@ -608,13 +613,14 @@ "urlMain": "https://www.dealabs.com/", "username_claimed": "blue" }, - "DeviantART": { - "errorType": "status_code", - "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", - "url": "https://{}.deviantart.com", - "urlMain": "https://deviantart.com", - "username_claimed": "blue" - }, + "DeviantArt": { + "errorType": "message", + "errorMsg": "Llama Not Found", + "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", + "url": "https://www.deviantart.com/{}", + "urlMain": "https://www.deviantart.com/", + "username_claimed": "blue" +}, "DigitalSpy": { "errorMsg": "The page you were looking for could not be found.", "errorType": "message", @@ -1440,12 +1446,12 @@ "username_claimed": "blue" }, "Mydramalist": { - "errorMsg": "Sign in - MyDramaList", - "errorType": "message", - "url": "https://www.mydramalist.com/profile/{}", - "urlMain": "https://mydramalist.com", - "username_claimed": "elhadidy12398" - }, + "errorMsg": "The requested page was not found", + "errorType": "message", + "url": "https://www.mydramalist.com/profile/{}", + "urlMain": 
"https://mydramalist.com", + "username_claimed": "elhadidy12398" +}, "Myspace": { "errorType": "status_code", "url": "https://myspace.com/{}", @@ -1459,6 +1465,13 @@ "urlMain": "https://www.native-instruments.com/forum/", "username_claimed": "jambert" }, + "namuwiki": { + "__comment__": "This is a Korean site and it's expected to return false negatives in certain other regions.", + "errorType": "status_code", + "url": "https://namu.wiki/w/%EC%82%AC%EC%9A%A9%EC%9E%90:{}", + "urlMain": "https://namu.wiki/", + "username_claimed": "namu" + }, "NationStates Nation": { "errorMsg": "Was this your nation? It may have ceased to exist due to inactivity, but can rise again!", "errorType": "message", @@ -1809,8 +1822,7 @@ "username_claimed": "blue" }, "Roblox": { - "errorMsg": "Page cannot be found or no longer exists", - "errorType": "message", + "errorType": "status_code", "url": "https://www.roblox.com/user.aspx?username={}", "urlMain": "https://www.roblox.com/", "username_claimed": "bluewolfekiller" @@ -1918,7 +1930,7 @@ }, "SlideShare": { "errorType": "message", - "errorMsg": "Username available", + "errorMsg": "Page no longer exists", "url": "https://slideshare.net/{}", "urlMain": "https://slideshare.net/", "username_claimed": "blue" @@ -1952,6 +1964,13 @@ "urlMain": "https://www.snapchat.com", "username_claimed": "teamsnapchat" }, + "SOOP": { + "errorType": "status_code", + "url": "https://www.sooplive.co.kr/station/{}", + "urlMain": "https://www.sooplive.co.kr/", + "urlProbe": "https://api-channel.sooplive.co.kr/v1.1/channel/{}/station", + "username_claimed": "udkn" + }, "SoundCloud": { "errorType": "status_code", "url": "https://soundcloud.com/{}", @@ -2119,6 +2138,12 @@ "urlMain": "https://themeforest.net/", "username_claimed": "user" }, + "tistory": { + "errorType": "status_code", + "url": "https://{}.tistory.com/", + "urlMain": "https://www.tistory.com/", + "username_claimed": "notice" + }, "TnAFlix": { "errorType": "status_code", "isNSFW": true, @@ -2793,7 +2818,7 
@@ "username_claimed": "green" }, "threads": { - "errorMsg": "Threads", + "errorMsg": "Threads • Log in", "errorType": "message", "headers": { "Sec-Fetch-Mode": "navigate" diff --git a/sherlock_project/resources/data.schema.json b/sherlock_project/resources/data.schema.json index 216ffb62c..c717cb256 100644 --- a/sherlock_project/resources/data.schema.json +++ b/sherlock_project/resources/data.schema.json @@ -1,80 +1,149 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "Sherlock Target Manifest", - "description": "Social media targets to probe for the existence of known usernames", - "type": "object", - "properties": { - "$schema": { "type": "string" } - }, - "patternProperties": { - "^(?!\\$).*?$": { - "type": "object", - "description": "Target name and associated information (key should be human readable name)", - "required": [ "url", "urlMain", "errorType", "username_claimed" ], - "properties": { - "url": { "type": "string" }, - "urlMain": { "type": "string" }, - "urlProbe": { "type": "string" }, - "username_claimed": { "type": "string" }, - "regexCheck": { "type": "string" }, - "isNSFW": { "type": "boolean" }, - "headers": { "type": "object" }, - "request_payload": { "type": "object" }, - "__comment__": { - "type": "string", - "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock." 
- }, - "tags": { - "oneOf": [ - { "$ref": "#/$defs/tag" }, - { "type": "array", "items": { "$ref": "#/$defs/tag" } } - ] - }, - "request_method": { - "type": "string", - "enum": [ "GET", "POST", "HEAD", "PUT" ] - }, - "errorType": { - "type": "string", - "enum": [ "message", "response_url", "status_code" ] - }, - "errorMsg": { - "oneOf": [ - { "type": "string" }, - { "type": "array", "items": { "type": "string" } } - ] - }, - "errorCode": { - "oneOf": [ - { "type": "integer" }, - { "type": "array", "items": { "type": "integer" } } - ] - }, - "errorUrl": { "type": "string" }, - "response_url": { "type": "string" } + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Sherlock Target Manifest", + "description": "Social media targets to probe for the existence of known usernames", + "type": "object", + "properties": { + "$schema": { "type": "string" } + }, + "patternProperties": { + "^(?!\\$).*?$": { + "type": "object", + "description": "Target name and associated information (key should be human readable name)", + "required": ["url", "urlMain", "errorType", "username_claimed"], + "properties": { + "url": { "type": "string" }, + "urlMain": { "type": "string" }, + "urlProbe": { "type": "string" }, + "username_claimed": { "type": "string" }, + "regexCheck": { "type": "string" }, + "isNSFW": { "type": "boolean" }, + "headers": { "type": "object" }, + "request_payload": { "type": "object" }, + "__comment__": { + "type": "string", + "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock." 
+ }, + "tags": { + "oneOf": [ + { "$ref": "#/$defs/tag" }, + { "type": "array", "items": { "$ref": "#/$defs/tag" } } + ] + }, + "request_method": { + "type": "string", + "enum": ["GET", "POST", "HEAD", "PUT"] + }, + "errorType": { + "oneOf": [ + { + "type": "string", + "enum": ["message", "response_url", "status_code"] }, - "dependencies": { - "errorMsg": { - "properties" : { "errorType": { "const": "message" } } - }, - "errorUrl": { - "properties": { "errorType": { "const": "response_url" } } - }, - "errorCode": { - "properties": { "errorType": { "const": "status_code" } } + { + "type": "array", + "items": { + "type": "string", + "enum": ["message", "response_url", "status_code"] + } + } + ] + }, + "errorMsg": { + "oneOf": [ + { "type": "string" }, + { "type": "array", "items": { "type": "string" } } + ] + }, + "errorCode": { + "oneOf": [ + { "type": "integer" }, + { "type": "array", "items": { "type": "integer" } } + ] + }, + "errorUrl": { "type": "string" }, + "response_url": { "type": "string" } + }, + "dependencies": { + "errorMsg": { + "oneOf": [ + { "properties": { "errorType": { "const": "message" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "message" } } - }, - "if": { "properties": { "errorType": { "const": "message" } } }, - "then": { "required": [ "errorMsg" ] }, - "else": { - "if": { "properties": { "errorType": { "const": "response_url" } } }, - "then": { "required": [ "errorUrl" ] } - }, - "additionalProperties": false + } + } + ] + }, + "errorUrl": { + "oneOf": [ + { "properties": { "errorType": { "const": "response_url" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "response_url" } + } + } + } + ] + }, + "errorCode": { + "oneOf": [ + { "properties": { "errorType": { "const": "status_code" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "status_code" } + } + } + } + ] + } + }, + "allOf": [ + { + "if": { + "anyOf": [ + { 
"properties": { "errorType": { "const": "message" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "message" } + } + } + } + ] + }, + "then": { "required": ["errorMsg"] } + }, + { + "if": { + "anyOf": [ + { "properties": { "errorType": { "const": "response_url" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "response_url" } + } + } + } + ] + }, + "then": { "required": ["errorUrl"] } } - }, - "additionalProperties": false, - "$defs": { - "tag": { "type": "string", "enum": [ "adult", "gaming" ] } + ], + "additionalProperties": false } + }, + "additionalProperties": false, + "$defs": { + "tag": { "type": "string", "enum": ["adult", "gaming"] } + } } diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py index 250175a57..75b3e3d70 100644 --- a/sherlock_project/sherlock.py +++ b/sherlock_project/sherlock.py @@ -171,8 +171,6 @@ def sherlock( username: str, site_data: dict[str, dict[str, str]], query_notify: QueryNotify, - tor: bool = False, - unique_tor: bool = False, dump_response: bool = False, proxy: Optional[str] = None, timeout: int = 60, @@ -188,8 +186,6 @@ def sherlock( query_notify -- Object with base type of QueryNotify(). This will be used to notify the caller about query results. - tor -- Boolean indicating whether to use a tor circuit for the requests. - unique_tor -- Boolean indicating whether to use a new tor circuit for each request. proxy -- String indicating the proxy URL timeout -- Time in seconds to wait before timing out request. Default is 60 seconds. @@ -210,32 +206,9 @@ def sherlock( # Notify caller that we are starting the query. 
query_notify.start(username) - # Create session based on request methodology - if tor or unique_tor: - try: - from torrequest import TorRequest # noqa: E402 - except ImportError: - print("Important!") - print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.") - print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.") - print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n") - sys.exit(query_notify.finish()) - - print("Important!") - print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.") - - # Requests using Tor obfuscation - try: - underlying_request = TorRequest() - except OSError: - print("Tor not found in system path. Unable to continue.\n") - sys.exit(query_notify.finish()) - underlying_session = underlying_request.session - else: - # Normal requests - underlying_session = requests.session() - underlying_request = requests.Request() + # Normal requests + underlying_session = requests.session() # Limit number of workers to 20. # This is probably vastly overkill. @@ -359,15 +332,10 @@ def sherlock( # Store future in data for access later net_info["request_future"] = future - # Reset identify for tor (if needed) - if unique_tor: - underlying_request.reset_identity() - # Add this site's results into final dictionary with all the other results. results_total[social_network] = results_site # Open the file containing account links - # Core logic: If tor requests, make them here. 
If multi-threaded requests, wait for responses for social_network, net_info in site_data.items(): # Retrieve results again results_site = results_total.get(social_network) @@ -381,6 +349,8 @@ def sherlock( # Get the expected error type error_type = net_info["errorType"] + if isinstance(error_type, str): + error_type: list[str] = [error_type] # Retrieve future and ensure it has finished future = net_info["request_future"] @@ -425,58 +395,60 @@ def sherlock( elif any(hitMsg in r.text for hitMsg in WAFHitMsgs): query_status = QueryStatus.WAF - elif error_type == "message": - # error_flag True denotes no error found in the HTML - # error_flag False denotes error found in the HTML - error_flag = True - errors = net_info.get("errorMsg") - # errors will hold the error message - # it can be string or list - # by isinstance method we can detect that - # and handle the case for strings as normal procedure - # and if its list we can iterate the errors - if isinstance(errors, str): - # Checks if the error message is in the HTML - # if error is present we will set flag to False - if errors in r.text: - error_flag = False - else: - # If it's list, it will iterate all the error message - for error in errors: - if error in r.text: - error_flag = False - break - if error_flag: - query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE - elif error_type == "status_code": - error_codes = net_info.get("errorCode") - query_status = QueryStatus.CLAIMED - - # Type consistency, allowing for both singlets and lists in manifest - if isinstance(error_codes, int): - error_codes = [error_codes] - - if error_codes is not None and r.status_code in error_codes: - query_status = QueryStatus.AVAILABLE - elif r.status_code >= 300 or r.status_code < 200: - query_status = QueryStatus.AVAILABLE - elif error_type == "response_url": - # For this detection method, we have turned off the redirect. 
- # So, there is no need to check the response URL: it will always - # match the request. Instead, we will ensure that the response - # code indicates that the request was successful (i.e. no 404, or - # forward to some odd redirect). - if 200 <= r.status_code < 300: - query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE else: - # It should be impossible to ever get here... - raise ValueError( - f"Unknown Error Type '{error_type}' for " f"site '{social_network}'" - ) + if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type): + error_context = f"Unknown error type '{error_type}' for {social_network}" + query_status = QueryStatus.UNKNOWN + else: + if "message" in error_type: + # error_flag True denotes no error found in the HTML + # error_flag False denotes error found in the HTML + error_flag = True + errors = net_info.get("errorMsg") + # errors will hold the error message + # it can be string or list + # by isinstance method we can detect that + # and handle the case for strings as normal procedure + # and if its list we can iterate the errors + if isinstance(errors, str): + # Checks if the error message is in the HTML + # if error is present we will set flag to False + if errors in r.text: + error_flag = False + else: + # If it's list, it will iterate all the error message + for error in errors: + if error in r.text: + error_flag = False + break + if error_flag: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE + + if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE: + error_codes = net_info.get("errorCode") + query_status = QueryStatus.CLAIMED + + # Type consistency, allowing for both singlets and lists in manifest + if isinstance(error_codes, int): + error_codes = [error_codes] + + if error_codes is not None and r.status_code in error_codes: + query_status = QueryStatus.AVAILABLE + elif r.status_code >= 300 or r.status_code < 200: + 
query_status = QueryStatus.AVAILABLE + + if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE: + # For this detection method, we have turned off the redirect. + # So, there is no need to check the response URL: it will always + # match the request. Instead, we will ensure that the response + # code indicates that the request was successful (i.e. no 404, or + # forward to some odd redirect). + if 200 <= r.status_code < 300: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE if dump_response: print("+++++++++++++++++++++") @@ -596,22 +568,6 @@ def main(): dest="output", help="If using single username, the output of the result will be saved to this file.", ) - parser.add_argument( - "--tor", - "-t", - action="store_true", - dest="tor", - default=False, - help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.", - ) - parser.add_argument( - "--unique-tor", - "-u", - action="store_true", - dest="unique_tor", - default=False, - help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.", - ) parser.add_argument( "--csv", action="store_true", @@ -719,12 +675,22 @@ def main(): help="Include checking of NSFW sites from default list.", ) + # TODO deprecated in favor of --txt, retained for workflow compatibility, to be removed + # in future release parser.add_argument( "--no-txt", action="store_true", dest="no_txt", default=False, - help="Disable creation of a txt file", + help="Disable creation of a txt file - WILL BE DEPRECATED", + ) + + parser.add_argument( + "--txt", + action="store_true", + dest="output_txt", + default=False, + help="Enable creation of a txt file", ) parser.add_argument( @@ -742,7 +708,7 @@ def main(): # Check for newer version of Sherlock. 
If it exists, let the user know about it try: - latest_release_raw = requests.get(forge_api_latest_release).text + latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text latest_release_json = json_loads(latest_release_raw) latest_remote_tag = latest_release_json["tag_name"] @@ -755,22 +721,10 @@ def main(): except Exception as error: print(f"A problem occurred while checking for an update: {error}") - # Argument check - # TODO regex check on args.proxy - if args.tor and (args.proxy is not None): - raise Exception("Tor and Proxy cannot be set at the same time.") - # Make prompts if args.proxy is not None: print("Using the proxy: " + args.proxy) - if args.tor or args.unique_tor: - print("Using Tor to make requests") - - print( - "Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors." - ) - if args.no_color: # Disable color output. init(strip=True, convert=False) @@ -802,7 +756,7 @@ def main(): if args.json_file.isnumeric(): pull_number = args.json_file pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}" - pull_request_raw = requests.get(pull_url).text + pull_request_raw = requests.get(pull_url, timeout=10).text pull_request_json = json_loads(pull_request_raw) # Check if it's a valid pull request @@ -871,8 +825,6 @@ def main(): username, site_data, query_notify, - tor=args.tor, - unique_tor=args.unique_tor, dump_response=args.dump_response, proxy=args.proxy, timeout=args.timeout, @@ -888,7 +840,7 @@ def main(): else: result_file = f"{username}.txt" - if not args.no_txt: + if args.output_txt: with open(result_file, "w", encoding="utf-8") as file: exists_counter = 0 for website_name in results: diff --git a/sherlock_project/sites.py b/sherlock_project/sites.py index 2ba811d77..b7aaf4c58 100644 --- a/sherlock_project/sites.py +++ b/sherlock_project/sites.py @@ -129,7 +129,7 @@ def __init__( if data_file_path.lower().startswith("http"): # 
Reference is to a URL. try: - response = requests.get(url=data_file_path) + response = requests.get(url=data_file_path, timeout=30) except Exception as error: raise FileNotFoundError( f"Problem while attempting to access data file URL '{data_file_path}': {error}" @@ -166,7 +166,7 @@ def __init__( if honor_exclusions: try: - response = requests.get(url=EXCLUSIONS_URL) + response = requests.get(url=EXCLUSIONS_URL, timeout=10) if response.status_code == 200: exclusions = response.text.splitlines() exclusions = [exclusion.strip() for exclusion in exclusions]