diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 31e01ed..3fdbd7c 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -33,26 +33,25 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} + cache: 'pip' + cache-dependency-path: '**/pyproject.toml' - - name: Install uv - uses: astral-sh/setup-uv@v5 - with: - enable-cache: true - cache-dependency-glob: "pyproject.toml" - - - name: Install package with dev and test dependencies - run: uv sync --extra test + - name: Install package with dependencies + run: pip install -e ".[test]" # for all versions but the one we use for code coverage, run normally - - name: Run unit tests normally - run: uv run pytest + - name: Run unit tests without code coverage + run: pytest if: ${{ matrix.python != env.COV_PYTHON_VERSION }} # run code coverage in one version only - name: Run unit tests with code coverage reporting - run: uv run pytest --cov=undate + run: pytest --cov=. if: ${{ matrix.python == env.COV_PYTHON_VERSION }} - - name: Upload test coverage to Codecov - uses: codecov/codecov-action@v3 + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} if: ${{ matrix.python == env.COV_PYTHON_VERSION }} diff --git a/examples/pgp_dates.ipynb b/examples/pgp_dates.ipynb new file mode 100644 index 0000000..43a858c --- /dev/null +++ b/examples/pgp_dates.ipynb @@ -0,0 +1,4406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2d231f1e-3944-4579-b868-504f7fb2d543", + "metadata": {}, + "source": [ + "# Princeton Geniza Project\n", + "\n", + "This notebook demonstrates parsing dates from non-Gregorian calendars and working with mixed-calendar dates.\n", + "\n", + "This notebook uses document data from the [Princeton Geniza Project](https://geniza.princeton.edu/), which is a database of fragmentary medieval documents found in the Cairo Geniza. 
Documents are written largely in Hebrew script in Hebrew and Arabic languages, and use a range of calendars including: \n", + "- Hebrew _Anno Mundi_\n", + "- Islamic _Hijri_\n", + "- Hebrew Seleucid calendar (_Anno Mundi_ calendar with a 3449 year offset)\n", + "\n", + "The dataset includes original dates and standardized Common Era dates (Julian before 1583, Gregorian after).\n", + "\n", + "This notebook uses the data published on GitHub at https://github.com/princetongenizalab/pgp-metadata\n", + "\n", + "\n", + "*Notebook authored by Rebecca Sutton Koeser, 2025.*" + ] + }, + { + "cell_type": "markdown", + "id": "9d9da1cf-6cc6-4b6a-9baf-782152998d82", + "metadata": {}, + "source": [ + "## Load and filter data\n", + "\n", + "Limit to documents with authoritative \"date on document\" set in the metadata." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "67c5532d-ebc4-4e1e-aa64-e6802ed1d971", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "pgp_documents_csv = \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n", + "documents = pd.read_csv(pgp_documents_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "41dc5a05-a04b-4b6d-acfe-1f7b04849346", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total documents: 35,187\n", + "Documents with dates: 4,451\n", + " date on document: 4,126\n", + " inferred dating: 331\n" + ] + } + ], + "source": [ + "# limit to documents with dates\n", + "docs_with_dates = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()]\n", + "docs_with_docdate = documents[documents.doc_date_standard.notna()].copy()\n", + "docs_with_inferreddate = documents[documents.inferred_date_standard.notna()]\n", + "\n", + "print(f\"\"\"\n", + "Total documents: {len(documents):,}\n", + "Documents with dates: {len(docs_with_dates):,}\n", + " date on document: 
{len(docs_with_docdate):,}\n", + " inferred dating: {len(docs_with_inferreddate):,}\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "94d6340b-10d0-461b-b745-378ffa1ffcec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
54491570Seleucid1259
1646319 Adar 1427Seleucid1116-03-05
17464Tammuz 1288Seleucid0977-06-21/0977-07-19
234721337Seleucid1025-08-28/1026-09-14
36491NaNNaN1131
41499Wednesday, 15 Kislev 1500Seleucid1188-12-07
43502Tevet 1548Seleucid1236-11-30/1236-12-28
47506Elul 1428Seleucid1117-08-01/1117-08-29
55516First decade of Ḥeshvan 1442Seleucid1130-10-06/1130-10-15
61524Thursday, 12 Sivan 4795Anno Mundi1035-05-22
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "5 449 1570 Seleucid \n", + "16 463 19 Adar 1427 Seleucid \n", + "17 464 Tammuz 1288 Seleucid \n", + "23 472 1337 Seleucid \n", + "36 491 NaN NaN \n", + "41 499 Wednesday, 15 Kislev 1500 Seleucid \n", + "43 502 Tevet 1548 Seleucid \n", + "47 506 Elul 1428 Seleucid \n", + "55 516 First decade of Ḥeshvan 1442 Seleucid \n", + "61 524 Thursday, 12 Sivan 4795 Anno Mundi \n", + "\n", + " doc_date_standard \n", + "5 1259 \n", + "16 1116-03-05 \n", + "17 0977-06-21/0977-07-19 \n", + "23 1025-08-28/1026-09-14 \n", + "36 1131 \n", + "41 1188-12-07 \n", + "43 1236-11-30/1236-12-28 \n", + "47 1117-08-01/1117-08-29 \n", + "55 1130-10-06/1130-10-15 \n", + "61 1035-05-22 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs_with_docdate[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "4df9e446-1f9c-4585-9557-3164cc8ce01f", + "metadata": {}, + "source": [ + "## Parse dates (standard and original)\n", + "\n", + "Parse the standardized date (Julian/Gregorian) as EDTF; in some cases this may fail due to invalid user-entered data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9703b47-a7e2-4178-a7da-fb47db11b5b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parse error on 1217-02-20/1217-02-29: Error trying to process rule \"date\":\n", + "\n", + "Day out of range in datetime string \"1217-02-29\"\n", + "Parse error on 1747-02-29: Error trying to process rule \"date\":\n", + "\n", + "Day out of range in datetime string \"1747-02-29\"\n" + ] + } + ], + "source": [ + "from lark.visitors import VisitError\n", + "\n", + "# first, how far can we get with the standard dates? 
can we parse as edtf and sort, render?\n", + "from undate import Undate \n", + "\n", + "def parse_standard_date(value):\n", + " try:\n", + " return Undate.parse(value, \"EDTF\")\n", + " except VisitError as err:\n", + " print(f\"Parse error on {value}: {err}\")\n", + " \n", + "\n", + "# ignore gregorian/julian distinction for now\n", + "# from pgp code:\n", + "# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October\n", + "# cut off between gregorian/julian dates, in julian days\n", + "#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n", + "\n", + "docs_with_docdate['undate_standard'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f49e82a4-b05b-4395-998f-0c9e75729e9f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardlast_modified
31903957middle decade of Adar 1528Seleucid1217-02-20/1217-02-292025-04-12 20:45:36.603800+00:00
3443740006NaNNaN1747-02-292024-08-07 18:24:19.425288+00:00
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "3190 3957 middle decade of Adar 1528 Seleucid \n", + "34437 40006 NaN NaN \n", + "\n", + " doc_date_standard last_modified \n", + "3190 1217-02-20/1217-02-29 2025-04-12 20:45:36.603800+00:00 \n", + "34437 1747-02-29 2024-08-07 18:24:19.425288+00:00 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the records with standardized dates that couldn't be parsed?\n", + "\n", + "# this is probably a data error in the original\n", + "\n", + "docs_with_docdate[docs_with_docdate.undate_standard.isna()][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']]" + ] + }, + { + "cell_type": "markdown", + "id": "3632e7f2-aae9-4136-9bb0-32789de34c4e", + "metadata": {}, + "source": [ + "What calendars are used by documents with original dates?" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2d502575-a2b4-4fce-9f59-6932275dfac2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Seleucid 1604\n", + "Anno Mundi 1147\n", + "Hijrī 884\n", + "Kharājī 8\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs_with_docdate.doc_date_calendar.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "04e4ffb2-13e7-49cc-913b-2104b61aef16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
61524Thursday, 12 Sivan 4795Anno Mundi1035-05-22
9056110 Nisan 4716Anno Mundi0956-03-24
111582Thursday, 6 Adar 4996Anno Mundi1236-02-14
119591Sunday, 29 Tammuz 4898Anno Mundi1138-07-10
1316034805/4806Anno Mundi1044-08-27/1045-09-13
17766022 Sivan 4974Anno Mundi1214-06-01
207695Friday, [25] Nisan [4810]Anno Mundi1050-04-20
2157038 Elul (4)811Anno Mundi1051-08-18
255750Friday, 24 Ḥeshvan 4765Anno Mundi1004-11-10
264760Thursday, 11 Av 4783Anno Mundi1023-08-01
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard\n", + "61 524 Thursday, 12 Sivan 4795 Anno Mundi 1035-05-22\n", + "90 561 10 Nisan 4716 Anno Mundi 0956-03-24\n", + "111 582 Thursday, 6 Adar 4996 Anno Mundi 1236-02-14\n", + "119 591 Sunday, 29 Tammuz 4898 Anno Mundi 1138-07-10\n", + "131 603 4805/4806 Anno Mundi 1044-08-27/1045-09-13\n", + "177 660 22 Sivan 4974 Anno Mundi 1214-06-01\n", + "207 695 Friday, [25] Nisan [4810] Anno Mundi 1050-04-20\n", + "215 703 8 Elul (4)811 Anno Mundi 1051-08-18\n", + "255 750 Friday, 24 Ḥeshvan 4765 Anno Mundi 1004-11-10\n", + "264 760 Thursday, 11 Av 4783 Anno Mundi 1023-08-01" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# example hebrew dates\n", + "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "101b8194-35b3-4e7e-b3e4-68dfec2e932c", + "metadata": {}, + "source": [ + "### Inspect variations in the data that may cause problems for parsing\n", + "\n", + "There are some idiosyncrasies with the original dates, since some of them were entered before the PGPv4 system supported built-in conversion.\n", + "\n", + "- calendar abbreviation included in the date string (i.e., AM, AH for _Anno Mundi_, _Anno Hegirae_ respectively)\n", + "- brackets for inferred digits or unknown digits (e.g., `152[.]` or `[4]82[.]`)\n", + "- ordinals instead of numerals for the day of the month (e.g., \"11th Tammuz 4767\" or \"Monday, 27th Ṭevet 4797\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4d11e583-7c80-44ed-80b1-d0c5b7b7f408", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_38072/1200615794.py:2: UserWarning: Boolean Series key will be reindexed to match 
DataFrame index.\n", + " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
7021223Wednesday, 9 Tammuz 4912 AMAnno Mundi1152-06-13
1669819975Sunday, 10 Kislev 5583 AMAnno Mundi1822-11-24
2541530550Tammuz 5537 AMAnno Mundi1777-07-06/1777-08-03
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "702 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n", + "16698 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n", + "25415 30550 Tammuz 5537 AM Anno Mundi \n", + "\n", + " doc_date_standard \n", + "702 1152-06-13 \n", + "16698 1822-11-24 \n", + "25415 1777-07-06/1777-08-03 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many end with AM ?\n", + "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n", + "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cd1a751a-5299-418f-a3f8-050ab0384354", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
15562163first third of Tammuz 500[.]Anno Mundi1244/1249
15672175End of Sivan 152[.]Seleucid1209/1218
1753246013[..]Seleucid988/1088
201827451[.] Kislev 48[..]Anno Mundi1039-11-30/1138-11-24
3044380513[..]Seleucid988/1087
...............
305893595512 Muḥarram 52[.]Hijrī1126/1134
312263673854[.]Hijrī1145/1154
325483807714[...]Seleucid1088-09-19/1188-09-23
346524022649[.]Hijrī1096-12-19/1106-09-01
3476040335[4]82[.]Anno Mundi1059-09-11/1069-09-18
\n", + "

66 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "1556 2163 first third of Tammuz 500[.] Anno Mundi \n", + "1567 2175 End of Sivan 152[.] Seleucid \n", + "1753 2460 13[..] Seleucid \n", + "2018 2745 1[.] Kislev 48[..] Anno Mundi \n", + "3044 3805 13[..] Seleucid \n", + "... ... ... ... \n", + "30589 35955 12 Muḥarram 52[.] Hijrī \n", + "31226 36738 54[.] Hijrī \n", + "32548 38077 14[...] Seleucid \n", + "34652 40226 49[.] Hijrī \n", + "34760 40335 [4]82[.] Anno Mundi \n", + "\n", + " doc_date_standard \n", + "1556 1244/1249 \n", + "1567 1209/1218 \n", + "1753 988/1088 \n", + "2018 1039-11-30/1138-11-24 \n", + "3044 988/1087 \n", + "... ... \n", + "30589 1126/1134 \n", + "31226 1145/1154 \n", + "32548 1088-09-19/1188-09-23 \n", + "34652 1096-12-19/1106-09-01 \n", + "34760 1059-09-11/1069-09-18 \n", + "\n", + "[66 rows x 4 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many include periods?\n", + "docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9fa8d2ba-6612-4de5-8741-dea177f99412", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
6351154Last decade of Kislev 5004Anno Mundi1243-12
1172175011th Tammuz 4767Anno Mundi1007
11731751Monday, 27th Ṭevet 4797Anno Mundi1037-01-23
15562163first third of Tammuz 500[.]Anno Mundi1244/1249
51426795last decade of Tishrei 4991Anno Mundi1230-09-29/1230-10-08
52236892last decade of Iyyar 4906Anno Mundi1146-05-04/1146-05-13
56647409last third of Ḥeshvan 4965Anno Mundi1204-10-17/1204-10-25
58127581middle third of Adar 4876Anno Mundi1116-05
70249068Last decade of Ṭevet 4898Anno Mundi1138-01
863811215Middle third of Av 4889Anno Mundi1129-07-29/1129-08-07
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "635 1154 Last decade of Kislev 5004 Anno Mundi \n", + "1172 1750 11th Tammuz 4767 Anno Mundi \n", + "1173 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n", + "1556 2163 first third of Tammuz 500[.] Anno Mundi \n", + "5142 6795 last decade of Tishrei 4991 Anno Mundi \n", + "5223 6892 last decade of Iyyar 4906 Anno Mundi \n", + "5664 7409 last third of Ḥeshvan 4965 Anno Mundi \n", + "5812 7581 middle third of Adar 4876 Anno Mundi \n", + "7024 9068 Last decade of Ṭevet 4898 Anno Mundi \n", + "8638 11215 Middle third of Av 4889 Anno Mundi \n", + "\n", + " doc_date_standard \n", + "635 1243-12 \n", + "1172 1007 \n", + "1173 1037-01-23 \n", + "1556 1244/1249 \n", + "5142 1230-09-29/1230-10-08 \n", + "5223 1146-05-04/1146-05-13 \n", + "5664 1204-10-17/1204-10-25 \n", + "5812 1116-05 \n", + "7024 1138-01 \n", + "8638 1129-07-29/1129-08-07 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many use ordinals instead of numerals?\n", + "hebrew_dates[hebrew_dates.doc_date_original.str.contains(\"st\") | hebrew_dates.doc_date_original.str.contains(\"rd\") | hebrew_dates.doc_date_original.str.contains(\"th\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5b6d5811-fe81-471d-bd29-896cec4c98ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11th Tammuz 4767: 11 Tammuz 4767\n", + "27th Tevet: 27 Tevet\n", + "8th Kislev: 8 Kislev\n" + ] + } + ], + "source": [ + "import re\n", + "\n", + "def remove_ordinals(val):\n", + " return re.sub(r'(\\d+)(st|nd|rd|th)', \"\\\\1\", val)\n", + "\n", + "# test removing ordinals without removing the numbers\n", + "for val in ['11th Tammuz 4767', \"27th Tevet\", \"8th Kislev\"]:\n", + " print(f\"{val}: { remove_ordinals(val)}\")" + ] + }, + { + 
"cell_type": "markdown", + "id": "7b0347b7-954b-4d2e-ad95-44dc2e24ac01", + "metadata": {}, + "source": [ + "Since this dataset has a mix of calendars and has known inconsistencies that may need cleaning,\n", + "we define a custom parsing method that selects the appropriate calendar and simplifies date portions that are not currently supported by the undate parsers." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "798da8f2-2332-48c2-aeec-214474e9d49c", + "metadata": {}, + "outputs": [], + "source": [ + "# parse hijri, anno mundi, and seleucid dates as undates\n", + "\n", + "from lark.exceptions import UnexpectedEOF\n", + "\n", + "# set this to True to see details about parsing\n", + "VERBOSE_PARSE_OUTPUT = False \n", + "\n", + "\n", + "def parse_original_date(row):\n", + " # print(f\"PGPID {row.pgpid} {row.doc_date_original} ({row.doc_date_calendar})\")\n", + " undate_calendar = None\n", + " if row.doc_date_calendar == \"Anno Mundi\":\n", + " undate_calendar = \"Hebrew\"\n", + " elif row.doc_date_calendar == \"Hijrī\":\n", + " undate_calendar = \"Islamic\"\n", + " elif row.doc_date_calendar == \"Seleucid\":\n", + " # handle seleucid as hebrew with offset (adapt from pgp code)\n", + " undate_calendar = \"Seleucid\"\n", + "\n", + " \n", + " if undate_calendar:\n", + " value = row.doc_date_original\n", + "\n", + " # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]\n", + " # ... the calendar parser don't support this, even though Undate does support unknown digits\n", + " # in future, perhaps we can add missing digit logic with this syntax to share across appropriate parsers\n", + " if '[.' in value:\n", + " if VERBOSE_PARSE_OUTPUT:\n", + " print(f\"ignoring missing digits for now {value}\")\n", + " value = value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\") \n", + " \n", + " # some dates have inferred numbers, e.g. 
Friday, [25] Nisan [4810] or 8 Elul (4)811'\n", + " # for now, just strip out brackets before parsing; \n", + " # in future, could potentially infer uncertainty based on these\n", + " value = value.replace('[', '').replace(']', '').replace('(', '').replace(')', '')\n", + "\n", + " # for now, remove modifiers that are not supported by undate parser:\n", + " # Late Tevet 4903, Last decade of Kislev 5004, first third of ...\n", + " # some dates include of, e.g. day of month\n", + " modifiers = [\"Late \", \"(first|middle|last)( third|half|decade|tenth)? (of )?\", \"(Beginning|end) of \", \"last day\", \"First 10 days\", \" of\", \"spring\", \"decade \", \"night, \"]\n", + " for mod in modifiers:\n", + " value = re.sub(mod, \"\", value, flags=re.I)\n", + "\n", + " # there are a handful of misspelled wednesdays...\n", + " value = value.replace(\"Wedensday\", \"Wednesday\")\n", + " # and a Thrusday\n", + " value = value.replace(\"Thrusday\", \"Thursday\")\n", + "\n", + " # three Hebrew calendar dates include text \"AM\" at end; at least one AH date\n", + " if value.endswith(\" AM\") or value.endswith(\" AH\"):\n", + " value = value[:-3]\n", + " if value.endswith(\".\"): # strip off trailing period\n", + " value = value[:-1]\n", + "\n", + " # about 62 have ordinals; strip them out\n", + " value = remove_ordinals(value)\n", + " \n", + " try:\n", + " return Undate.parse(value, undate_calendar)\n", + " except (VisitError, ValueError, UnexpectedEOF) as err:\n", + " if VERBOSE_PARSE_OUTPUT:\n", + " print(f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\")\n", + "\n", + " # there are a handful of cases in PGP where calendars are mixed,\n", + " # i.e. 
hebrew months used for hijri calendar\n", + "\n", + " # some dates are entered in ISO format for another calendar; can we parse and set calendar?\n", + " if \"-\" in value and \"/\" not in value: # exclude intervals for now\n", + " try:\n", + " parsed = Undate.parse(value, \"ISO8601\")\n", + " if parsed:\n", + " parsed = parsed.as_calendar(undate_calendar)\n", + " if VERBOSE_PARSE_OUTPUT:\n", + " print(f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\")\n", + " return parsed\n", + " except ValueError as err:\n", + " if VERBOSE_PARSE_OUTPUT:\n", + " print(f\"Could not parse {value} as ISO date: {err}\")\n", + "\n", + "docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "6b2bfb96-2d8b-4f09-a9a6-c2534273d503", + "metadata": {}, + "source": [ + "### Review parsing results \n", + "\n", + "How many of the dates in supported calendars were parsed?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "623eb160-ab6c-44ba-b3f4-6770c2c7bd86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original dates parsed: 3462\n", + "original dates unparsed: 173 (anno mundi, hijri, and seleucid calendars)\n", + "proportion parsed: 95.24%\n" + ] + } + ], + "source": [ + "orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()\n", + "orig_dates_unparsed = docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid']) & docs_with_docdate.undate_orig.isna()] \n", + "\n", + "total_parsed = len(orig_dates_parsed)\n", + "total_unparsed = len(orig_dates_unparsed)\n", + "print(f\"\"\"original dates parsed: {total_parsed}\n", + "original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)\n", + "proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "ae5b3cfa-ed25-4a3d-ae78-c7590543ba20", + "metadata": {}, + "source": [ + "What is the date granularity of the dates that were parsed?\n", + "\n", + "Note that these results are skewed somewhat due to the modifiers and uncertainty that we are simplifying in order to parse the dates." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42945787-6788-422d-9a04-f983ec6b31af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precision
54491570Seleucid125912591570year
1646319 Adar 1427Seleucid1116-03-051116-03-051427-12-19day
17464Tammuz 1288Seleucid0977-06-21/0977-07-190977-06-21/0977-07-191288-04month
234721337Seleucid1025-08-28/1026-09-141025-08-28/1026-09-141337year
41499Wednesday, 15 Kislev 1500Seleucid1188-12-071188-12-071500-09-15day
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5 449 1570 Seleucid 1259 \n", + "16 463 19 Adar 1427 Seleucid 1116-03-05 \n", + "17 464 Tammuz 1288 Seleucid 0977-06-21/0977-07-19 \n", + "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n", + "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n", + "\n", + " undate_standard undate_orig orig_date_precision \n", + "5 1259 1570 year \n", + "16 1116-03-05 1427-12-19 day \n", + "17 0977-06-21/0977-07-19 1288-04 month \n", + "23 1025-08-28/1026-09-14 1337 year \n", + "41 1188-12-07 1500-09-15 day " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# determine original date precision based on parsed undate\n", + "orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())\n", + "orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "88f1d3ab-e1c7-48b5-8907-5aeea463f1e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orig_date_precision\n", + "day 1599\n", + "month 1027\n", + "year 836\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this is skewed because of the kinds of dates we're not able to parse or modifiers we're omitting entirely\n", + "orig_dates_parsed.orig_date_precision.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "3fec8455-1830-48b5-961c-4ae0847bf63c", + "metadata": {}, + "source": [ + "Check on the Seleucid date parsing by comparing undate calendar conversion with the standardized CE date included in the dataset.\n", + "\n", + "We expect `undate` dates before 1583 to be off by roughly 5 to 10 days, depending on the century, since we did not adjust for the Julian calendar." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5d3a55b0-ed36-47ba-b022-848bb128b449", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendarundate_origorig_date_precisiondoc_date_standardundate_earliestundate_latest
54491570Seleucid1570year12591258-09-071259-09-26
1646319 Adar 1427Seleucid1427-12-19day1116-03-051116-03-121116-03-12
17464Tammuz 1288Seleucid1288-04month0977-06-21/0977-07-190977-06-260977-07-24
234721337Seleucid1337year1025-08-28/1026-09-141025-09-031026-09-20
41499Wednesday, 15 Kislev 1500Seleucid1500-09-15day1188-12-071188-12-141188-12-14
43502Tevet 1548Seleucid1548-10month1236-11-30/1236-12-281236-12-071237-01-04
47506Elul 1428Seleucid1428-06month1117-08-01/1117-08-291117-08-081117-09-05
55516First decade of Ḥeshvan 1442Seleucid1442-08month1130-10-06/1130-10-151130-10-131130-11-10
73537Ḥeshvan 1453Seleucid1453-08month11411141-10-111141-11-08
75544Sunday, 21 Kislev 1355Seleucid1355-09-21day1043-11-261043-12-021043-12-02
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar undate_orig \\\n", + "5 449 1570 Seleucid 1570 \n", + "16 463 19 Adar 1427 Seleucid 1427-12-19 \n", + "17 464 Tammuz 1288 Seleucid 1288-04 \n", + "23 472 1337 Seleucid 1337 \n", + "41 499 Wednesday, 15 Kislev 1500 Seleucid 1500-09-15 \n", + "43 502 Tevet 1548 Seleucid 1548-10 \n", + "47 506 Elul 1428 Seleucid 1428-06 \n", + "55 516 First decade of Ḥeshvan 1442 Seleucid 1442-08 \n", + "73 537 Ḥeshvan 1453 Seleucid 1453-08 \n", + "75 544 Sunday, 21 Kislev 1355 Seleucid 1355-09-21 \n", + "\n", + " orig_date_precision doc_date_standard undate_earliest undate_latest \n", + "5 year 1259 1258-09-07 1259-09-26 \n", + "16 day 1116-03-05 1116-03-12 1116-03-12 \n", + "17 month 0977-06-21/0977-07-19 0977-06-26 0977-07-24 \n", + "23 year 1025-08-28/1026-09-14 1025-09-03 1026-09-20 \n", + "41 day 1188-12-07 1188-12-14 1188-12-14 \n", + "43 month 1236-11-30/1236-12-28 1236-12-07 1237-01-04 \n", + "47 month 1117-08-01/1117-08-29 1117-08-08 1117-09-05 \n", + "55 month 1130-10-06/1130-10-15 1130-10-13 1130-11-10 \n", + "73 month 1141 1141-10-11 1141-11-08 \n", + "75 day 1043-11-26 1043-12-02 1043-12-02 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seleucid_dates = orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'].copy()\n", + "# add undate earliest/latest (Gregorian) for comparison with dataset standardized date \n", + "seleucid_dates['undate_earliest'] = seleucid_dates.undate_orig.apply(lambda x: x.earliest)\n", + "seleucid_dates['undate_latest'] = seleucid_dates.undate_orig.apply(lambda x: x.latest)\n", + "\n", + "seleucid_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'undate_orig', 'orig_date_precision', 'doc_date_standard', 'undate_earliest', 'undate_latest']].head(10)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a104d772-6c2c-4711-91ec-8cf1f108ae23", + "metadata": {}, + 
"outputs": [], + "source": [ + "# can we sort by parsed original dates? \n", + "# doesn't work currently because of overlapping dates / different granularity\n", + "#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)" + ] + }, + { + "cell_type": "markdown", + "id": "29f5f6eb-9b7d-4a4a-815a-29002d1d024b", + "metadata": {}, + "source": [ + "## Plot documents by date\n", + "\n", + "For the dates we could parse, how are the documents distributed over time and calendar?\n", + "\n", + "First let's graph by year based on the midpoint of the date range." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c653d928-8fec-4ddc-9abf-ace2f7ca6629", + "metadata": {}, + "outputs": [], + "source": [ + "# set earliest/latest for graphing\n", + "\n", + "# NOTE: we have to cast type to something pandas/altair supports\n", + "\n", + "orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')\n", + "orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')\n", + "orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "91f155fe-d0e6-4ee4-99de-698ac301e3f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orig_date_earliestorig_date_latestorig_date_midpgpiddoc_date_calendar
51258-09-071259-09-261259-03-18449Seleucid
161116-03-121116-03-121116-03-12463Seleucid
17977-06-26977-07-24977-07-10464Seleucid
231025-09-031026-09-201026-03-13472Seleucid
411188-12-141188-12-141188-12-14499Seleucid
431236-12-071237-01-041236-12-21502Seleucid
471117-08-081117-09-051117-08-22506Seleucid
551130-10-131130-11-101130-10-27516Seleucid
611035-05-281035-05-281035-05-28524Anno Mundi
621034-08-251034-09-221034-09-08525Hijrī
\n", + "
" + ], + "text/plain": [ + " orig_date_earliest orig_date_latest orig_date_mid pgpid doc_date_calendar\n", + "5 1258-09-07 1259-09-26 1259-03-18 449 Seleucid\n", + "16 1116-03-12 1116-03-12 1116-03-12 463 Seleucid\n", + "17 977-06-26 977-07-24 977-07-10 464 Seleucid\n", + "23 1025-09-03 1026-09-20 1026-03-13 472 Seleucid\n", + "41 1188-12-14 1188-12-14 1188-12-14 499 Seleucid\n", + "43 1236-12-07 1237-01-04 1236-12-21 502 Seleucid\n", + "47 1117-08-08 1117-09-05 1117-08-22 506 Seleucid\n", + "55 1130-10-13 1130-11-10 1130-10-27 516 Seleucid\n", + "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi\n", + "62 1034-08-25 1034-09-22 1034-09-08 525 Hijrī" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "144b2a4a-81cf-4a6d-a277-3a7910354a77", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# graph documents by calendar\n", + "import altair as alt\n", + "\n", + "date_docs_cal = orig_dates_parsed[orig_dates_parsed.doc_date_standard.notna()]\n", + "\n", + "dated_docs_cal = date_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n", + "dated_docs_cal['midpoint_year'] = dated_docs_cal.orig_date_mid.apply(lambda x: x.year)\n", + "\n", + "orig_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n", + " x=alt.X('midpoint_year', title=\"Year (midpoint)\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n", + " y=alt.Y('count(pgpid)', title='Documents'),\n", + " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\")\n", + ").properties(width=900, height=200, title=\"Documents by calendar (original date)\")\n", + "\n", + "orig_dates_calendars_chart" + ] + }, + { + "cell_type": "markdown", + "id": "a8e8cd7c-0711-40ae-84f6-d3f8df6d5ccc", + "metadata": {}, + "source": [ + "For comparison, what does it look like if we graph by the standardized dates in the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4acc9a2b-d403-4f93-b2c5-6fee92ead105", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# graph documents with calendars\n", + "\n", + "def undate_midpoint(value):\n", + " # parsed standard date could be an undate or an interval; handle either\n", + " if isinstance(value, Undate):\n", + " earliest = value.earliest\n", + " latest = value.latest\n", + " else: # interval\n", + " earliest = value.earliest.earliest\n", + " latest = value.latest.latest\n", + " return earliest + (latest - earliest)/2\n", + " \n", + "\n", + "dated_docs_cal = docs_with_docdate.copy()\n", + "dated_docs_cal = dated_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n", + "# get the midpoint from the parsed standard date; convert to supported type\n", + "dated_docs_cal['midpoint'] = dated_docs_cal.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n", + "dated_docs_cal['midpoint_year'] = dated_docs_cal.midpoint.apply(lambda x: x.year if pd.notna(x) else None)\n", + "\n", + "\n", + "std_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n", + " x=alt.X('midpoint_year', title=\"Year\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n", + " y=alt.Y('count(pgpid)', title='Documents'),\n", + " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\").scale(domain=['Anno Mundi', 'Hijrī', 'Seleucid', 'Kharājī', 'Unspecified'])\n", + ").properties(width=900, height=200, title=\"Documents by calendar (standard date)\")\n", + "\n", + "std_dates_calendars_chart" + ] + }, + { + "cell_type": "markdown", + "id": "f42471a4-0c64-4237-92c0-0d201377fa9f", + "metadata": {}, + "source": [ + "Here are the two plots together. The unspecified calendars are most likely Julian/Gregorian dates." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "4d7c4d5f-636c-42a0-a906-21c67f5781b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_calendars_chart & std_dates_calendars_chart" + ] + }, + { + "cell_type": "markdown", + "id": "dc8a4617-ca69-4494-a2ef-6f4d442b82e6", + "metadata": {}, + "source": [ + "We can try graphing by range, but our parsing currently excludes the original dates with larger ranges." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c5861110-dbd5-4d7a-8ada-acf7cb871aa7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].copy()\n", + "# graphable_data['midpoint'] = graphable_data.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n", + "graphable_data['midpoint_year'] = graphable_data.orig_date_mid.apply(lambda x: x.year if pd.notna(x) else None)\n", + "\n", + "\n", + "bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(\n", + " x=alt.X('orig_date_earliest:T', title=\"original date (range)\"), # , axis=alt.Axis(format=\"r\")),\n", + " x2='orig_date_latest:T',\n", + " y=alt.Y('count(pgpid)', title='Count of Documents')\n", + ").properties(width=1200, height=150)\n", + "\n", + "line_chart = alt.Chart(graphable_data).mark_line(opacity=0.6, color=\"green\", interpolate=\"monotone\").encode(\n", + " x=alt.X('orig_date_mid:T', title=\"Year (midpoint)\"),\n", + " y=alt.Y('count(pgpid)', title='Documents')\n", + ").properties(width=1200, height=150)\n", + "\n", + "(bar_chart & line_chart).properties(title=\"Documents by date (1000-1300)\").interactive()" + ] + }, + { + "cell_type": "markdown", + "id": "951d92ea-4689-481c-8590-324b782a7a1c", + "metadata": {}, + "source": [ + "## Compare weekdays\n", + "\n", + "Sometimes the original date includes a day of the week; we don't expect these to be completely reliable, but let's compare the weekdays in the original date with the weekday as determined by the parsed `Undate`." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3122a874-bb17-429f-993f-4bf7a76c1a36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precisiontype
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28dayLegal document
17142418Monday 20 Tevet 1520Seleucid1208-12-291208-12-291520-10-20dayLegal document
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25dayLegal document
20132739Wednesday 29th Elul 1354Seleucid1043-09-071043-09-071354-06-29dayLegal document
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29dayLegal document
...........................
2930334623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20dayLegal document
2992435264Wednesday 13 Ṭevet 1526Seleucid1214/12151214/12151526-10-13dayLegal document
3400839564Monday 16 Tevet 1339Seleucid1027-12-181027-12-181339-10-16dayLegal document
3446640035Monday 1st Iyyar 1437Seleucid1126-04-261126-04-261437-02-01dayLegal document
3446740036Friday 15 of Adar 1443Seleucid1132-03-041132-03-041443-12-15dayLegal document
\n", + "

104 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1714 2418 Monday 20 Tevet 1520 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "... ... ... ... \n", + "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", + "34008 39564 Monday 16 Tevet 1339 Seleucid \n", + "34466 40035 Monday 1st Iyyar 1437 Seleucid \n", + "34467 40036 Friday 15 of Adar 1443 Seleucid \n", + "\n", + " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", + "851 1270 1270 1581-03-28 day \n", + "1714 1208-12-29 1208-12-29 1520-10-20 day \n", + "1929 1133 1133 1444-09-25 day \n", + "2013 1043-09-07 1043-09-07 1354-06-29 day \n", + "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "... ... ... ... ... \n", + "29303 1266/1267 1266/1267 1578-10-20 day \n", + "29924 1214/1215 1214/1215 1526-10-13 day \n", + "34008 1027-12-18 1027-12-18 1339-10-16 day \n", + "34466 1126-04-26 1126-04-26 1437-02-01 day \n", + "34467 1132-03-04 1132-03-04 1443-12-15 day \n", + "\n", + " type \n", + "851 Legal document \n", + "1714 Legal document \n", + "1929 Legal document \n", + "2013 Legal document \n", + "3257 Legal document \n", + "... ... 
\n", + "29303 Legal document \n", + "29924 Legal document \n", + "34008 Legal document \n", + "34466 Legal document \n", + "34467 Legal document \n", + "\n", + "[104 rows x 8 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision', 'type']]\n", + "weekday_dates" + ] + }, + { + "cell_type": "markdown", + "id": "d9c03fd7-731c-44ce-ae2d-0bc1308790d0", + "metadata": {}, + "source": [ + "Extract the weekday from the original date and determine the undate weekday.\n", + "\n", + "Both Arabic and Hebrew days begin in the evening, so if the date string includes the text \"night\" we shift the original day by one for comparison." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "3e4ea50c-b11c-433b-b6f9-691098b057d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precisiontypeundate_weekdayundate_weekday_nameorig_weekday
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28dayLegal document3ThursdayThursday
17142418Monday 20 Tevet 1520Seleucid1208-12-291208-12-291520-10-20dayLegal document0MondayMonday
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25dayLegal document0MondayMonday
20132739Wednesday 29th Elul 1354Seleucid1043-09-071043-09-071354-06-29dayLegal document2WednesdayWednesday
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29dayLegal document3ThursdayThursday
....................................
2930334623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20dayLegal document0MondayMonday
2992435264Wednesday 13 Ṭevet 1526Seleucid1214/12151214/12151526-10-13dayLegal document2WednesdayWednesday
3400839564Monday 16 Tevet 1339Seleucid1027-12-181027-12-181339-10-16dayLegal document0MondayMonday
3446640035Monday 1st Iyyar 1437Seleucid1126-04-261126-04-261437-02-01dayLegal document0MondayMonday
3446740036Friday 15 of Adar 1443Seleucid1132-03-041132-03-041443-12-15dayLegal document4FridayFriday
\n", + "

104 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1714 2418 Monday 20 Tevet 1520 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "... ... ... ... \n", + "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", + "34008 39564 Monday 16 Tevet 1339 Seleucid \n", + "34466 40035 Monday 1st Iyyar 1437 Seleucid \n", + "34467 40036 Friday 15 of Adar 1443 Seleucid \n", + "\n", + " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", + "851 1270 1270 1581-03-28 day \n", + "1714 1208-12-29 1208-12-29 1520-10-20 day \n", + "1929 1133 1133 1444-09-25 day \n", + "2013 1043-09-07 1043-09-07 1354-06-29 day \n", + "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "... ... ... ... ... \n", + "29303 1266/1267 1266/1267 1578-10-20 day \n", + "29924 1214/1215 1214/1215 1526-10-13 day \n", + "34008 1027-12-18 1027-12-18 1339-10-16 day \n", + "34466 1126-04-26 1126-04-26 1437-02-01 day \n", + "34467 1132-03-04 1132-03-04 1443-12-15 day \n", + "\n", + " type undate_weekday undate_weekday_name orig_weekday \n", + "851 Legal document 3 Thursday Thursday \n", + "1714 Legal document 0 Monday Monday \n", + "1929 Legal document 0 Monday Monday \n", + "2013 Legal document 2 Wednesday Wednesday \n", + "3257 Legal document 3 Thursday Thursday \n", + "... ... ... ... ... 
\n", + "29303 Legal document 0 Monday Monday \n", + "29924 Legal document 2 Wednesday Wednesday \n", + "34008 Legal document 0 Monday Monday \n", + "34466 Legal document 0 Monday Monday \n", + "34467 Legal document 4 Friday Friday \n", + "\n", + "[104 rows x 11 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "days = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n", + "\n", + "# get numeric weekday; since these dates are all day-precision we can just use the earliest date\n", + "weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)\n", + "weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])\n", + "# extract weekday from date label\n", + "weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()\n", + "# correct misspellings\n", + "misspelled_days = {\n", + " \"Wedensday\": \"Wednesday\",\n", + " \"Thrusday\": \"Thursday\",\n", + "}\n", + "weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))\n", + "\n", + "# shift night to next day, e.g. 
Wednesday night should be Thursday\n", + "# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day\n", + "def next_day(weekday):\n", + " return days[(days.index(weekday) +1) % 7]\n", + "\n", + "weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if \" night\" in row.doc_date_original else row.orig_weekday, axis=1)\n", + "\n", + "weekday_dates" + ] + }, + { + "cell_type": "markdown", + "id": "c3ab3428-9700-4e57-b3ff-329c737d98f7", + "metadata": {}, + "source": [ + "Here is the subset of records that specify \"night\":" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4ced7809-1414-44ae-aae7-c2d0d1dee9ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precisiontypeundate_weekdayundate_weekday_nameorig_weekday
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28dayLegal document3ThursdayThursday
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25dayLegal document0MondayMonday
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29dayLegal document3ThursdayThursday
55117237Tuesday night, 22 Kislev 1435Seleucid1123-12-121123-12-121435-09-22dayLegal document2WednesdayWednesday
58547637Monday night, 29 Ṭevet 1438Seleucid112711271438-10-29dayLegal document4FridayTuesday
58577642Thursday night, 23 Tammuz 1538Seleucid1227-07-091227-07-091538-04-23dayLegal document4FridayFriday
64198332Friday night, 20 Iyar 4957Anno Mundi1197-051197-054957-02-20dayLegal document5SaturdaySaturday
2930334623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20dayLegal document0MondayMonday
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "5511 7237 Tuesday night, 22 Kislev 1435 Seleucid \n", + "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid \n", + "5857 7642 Thursday night, 23 Tammuz 1538 Seleucid \n", + "6419 8332 Friday night, 20 Iyar 4957 Anno Mundi \n", + "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "\n", + " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", + "851 1270 1270 1581-03-28 day \n", + "1929 1133 1133 1444-09-25 day \n", + "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "5511 1123-12-12 1123-12-12 1435-09-22 day \n", + "5854 1127 1127 1438-10-29 day \n", + "5857 1227-07-09 1227-07-09 1538-04-23 day \n", + "6419 1197-05 1197-05 4957-02-20 day \n", + "29303 1266/1267 1266/1267 1578-10-20 day \n", + "\n", + " type undate_weekday undate_weekday_name orig_weekday \n", + "851 Legal document 3 Thursday Thursday \n", + "1929 Legal document 0 Monday Monday \n", + "3257 Legal document 3 Thursday Thursday \n", + "5511 Legal document 2 Wednesday Wednesday \n", + "5854 Legal document 4 Friday Tuesday \n", + "5857 Legal document 4 Friday Friday \n", + "6419 Legal document 5 Saturday Saturday \n", + "29303 Legal document 0 Monday Monday " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates[weekday_dates.doc_date_original.str.contains(\" night\")]" + ] + }, + { + "cell_type": "markdown", + "id": "94b8aae8-6bc1-425c-b723-427356cfb647", + "metadata": {}, + "source": [ + "How many of the original and undate weekdays match?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "fedb5323-0e9c-476e-a7e2-95443d2f9e1d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "44 matches, 60 mismatches (42.31%)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precisiontypeundate_weekdayundate_weekday_nameorig_weekday
52716947Monday 3 Iyyar 1740Seleucid1429-04-071429-04-071740-02-03dayLegal document3ThursdayMonday
58547637Monday night, 29 Ṭevet 1438Seleucid112711271438-10-29dayLegal document4FridayTuesday
864811227Monday 24 Jumādā I 517Hijrī1123-07-201123-07-200517-05-24dayParaliterary text4FridayMonday
1639719649Thursday 26 Iyyar 5306Anno Mundi1546-04-281546-04-285306-02-26dayLegal document2WednesdayThursday
1772321094Saturday 20 Rajab 550Hijrī1155-09-191155-09-190550-07-20dayLegal document0MondaySaturday
2309927479Tuesday 11 Tammuz 5525Anno Mundi1765-06-301765-06-305525-04-11dayLegal document6SundayTuesday
2310427484Friday 20th Shevat 5405Anno Mundi164516455405-11-20dayLegal document3ThursdayFriday
2310527485Sunday 22 Adar 5590Anno Mundi1830-03-171830-03-175590-12-22dayLegal document2WednesdaySunday
2310727487Thursday 15 Shevat 5450Anno Mundi1690-01-251690-01-255450-11-15dayLegal document2WednesdayThursday
2310927489Sunday 6 Nisan 5528Anno Mundi1768-03-241768-03-245528-01-06dayLegal document3ThursdaySunday
2311027490Thursday 19th Elul 5428Anno Mundi166816685428-06-19dayLegal document6SundayThursday
2311127491Tuesday 1 Kislev 5507Anno Mundi1746-11-141746-11-145507-09-01dayLegal document0MondayTuesday
2311627496Sunday 28 Elul 5511Anno Mundi1751-09-181751-09-185511-06-28dayLegal document5SaturdaySunday
2311727497Sunday 17th Sivan 5423Anno Mundi166316635423-03-17dayLegal document4FridaySunday
2311827498Sunday 25th Tevet 5409Anno Mundi164816485409-10-25dayLegal document5SaturdaySunday
2312027500Thursday 4 Sivan 5516Anno Mundi1756-06-021756-06-025516-03-04dayLegal document2WednesdayThursday
2312727507Sunday 25 Sivan 5556Anno Mundi1796-07-011796-07-015556-03-25dayLegal document4FridaySunday
2313127511Wednesday 28th Tevet 5399Anno Mundi164016405399-10-28dayLegal document1TuesdayWednesday
2313527515Monday 15th Iyyar 5414Anno Mundi165416545414-02-15dayLegal document5SaturdayMonday
2313627516Thursday 24 Nisan 5481Anno Mundi1721-04-211721-04-215481-01-24dayLegal document0MondayThursday
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5271 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n", + "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n", + "8648 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n", + "16397 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n", + "17723 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n", + "23099 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n", + "23104 27484 Friday 20th Shevat 5405 Anno Mundi 1645 \n", + "23105 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n", + "23107 27487 Thursday 15 Shevat 5450 Anno Mundi 1690-01-25 \n", + "23109 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n", + "23110 27490 Thursday 19th Elul 5428 Anno Mundi 1668 \n", + "23111 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n", + "23116 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n", + "23117 27497 Sunday 17th Sivan 5423 Anno Mundi 1663 \n", + "23118 27498 Sunday 25th Tevet 5409 Anno Mundi 1648 \n", + "23120 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n", + "23127 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n", + "23131 27511 Wednesday 28th Tevet 5399 Anno Mundi 1640 \n", + "23135 27515 Monday 15th Iyyar 5414 Anno Mundi 1654 \n", + "23136 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n", + "\n", + " undate_standard undate_orig orig_date_precision type \\\n", + "5271 1429-04-07 1740-02-03 day Legal document \n", + "5854 1127 1438-10-29 day Legal document \n", + "8648 1123-07-20 0517-05-24 day Paraliterary text \n", + "16397 1546-04-28 5306-02-26 day Legal document \n", + "17723 1155-09-19 0550-07-20 day Legal document \n", + "23099 1765-06-30 5525-04-11 day Legal document \n", + "23104 1645 5405-11-20 day Legal document \n", + "23105 1830-03-17 5590-12-22 day Legal document \n", + "23107 1690-01-25 5450-11-15 day Legal document \n", + "23109 1768-03-24 5528-01-06 day Legal document \n", + "23110 1668 5428-06-19 day Legal document \n", + "23111 
1746-11-14 5507-09-01 day Legal document \n", + "23116 1751-09-18 5511-06-28 day Legal document \n", + "23117 1663 5423-03-17 day Legal document \n", + "23118 1648 5409-10-25 day Legal document \n", + "23120 1756-06-02 5516-03-04 day Legal document \n", + "23127 1796-07-01 5556-03-25 day Legal document \n", + "23131 1640 5399-10-28 day Legal document \n", + "23135 1654 5414-02-15 day Legal document \n", + "23136 1721-04-21 5481-01-24 day Legal document \n", + "\n", + " undate_weekday undate_weekday_name orig_weekday \n", + "5271 3 Thursday Monday \n", + "5854 4 Friday Tuesday \n", + "8648 4 Friday Monday \n", + "16397 2 Wednesday Thursday \n", + "17723 0 Monday Saturday \n", + "23099 6 Sunday Tuesday \n", + "23104 3 Thursday Friday \n", + "23105 2 Wednesday Sunday \n", + "23107 2 Wednesday Thursday \n", + "23109 3 Thursday Sunday \n", + "23110 6 Sunday Thursday \n", + "23111 0 Monday Tuesday \n", + "23116 5 Saturday Sunday \n", + "23117 4 Friday Sunday \n", + "23118 5 Saturday Sunday \n", + "23120 2 Wednesday Thursday \n", + "23127 4 Friday Sunday \n", + "23131 1 Tuesday Wednesday \n", + "23135 5 Saturday Monday \n", + "23136 0 Monday Thursday " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]\n", + "\n", + "mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]\n", + "\n", + "print(f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)\")\n", + "mismatches.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "492352af-76db-47b5-afa2-f5388c4d1d71", + "metadata": {}, + "source": [ + "Is there any noticeable difference about where the mismatches are coming from based on calendar or day of week?"
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "d6476907-1628-4d68-ab1f-43c95e123707", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Anno Mundi 55\n", + "Seleucid 3\n", + "Hijrī 2\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mismatches.doc_date_calendar.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "18b71d18-5d5b-4f92-8801-499bcf412efe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orig_weekday\n", + "Wednesday 17\n", + "Sunday 12\n", + "Monday 10\n", + "Thursday 9\n", + "Tuesday 7\n", + "Friday 4\n", + "Saturday 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mismatches.orig_weekday.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "eb7ea065-e4b5-47aa-9538-8dc9851ea572", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 mismatches that include text 'night'\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precisiontypeundate_weekdayundate_weekday_nameorig_weekday
58547637Monday night, 29 Ṭevet 1438Seleucid112711271438-10-29dayLegal document4FridayTuesday
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n", + "\n", + " undate_standard undate_orig orig_date_precision type \\\n", + "5854 1127 1438-10-29 day Legal document \n", + "\n", + " undate_weekday undate_weekday_name orig_weekday \n", + "5854 4 Friday Tuesday " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many mismatches are due to night?\n", + "night_mismatches = mismatches[mismatches.doc_date_original.str.contains(\" night\")]\n", + "print(f\"{len(night_mismatches)} mismatches that include text 'night'\")\n", + "night_mismatches" + ] + }, + { + "cell_type": "markdown", + "id": "16f9a9db-434f-407e-8613-42941b4f3a14", + "metadata": {}, + "source": [ + "### Plot document frequency by day\n", + "\n", + "Because we're preserving as much date information as possible, we can plot based on things like weekday - even across different calendars.\n", + "\n", + "For documents with day-level date precision, how are they distributed by weekday?" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "ece780b8-2eb2-4cbc-9195-27def665f7fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get numeric weekday\n", + "orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)\n", + "orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n", + "\n", + "# restrict to dates with day precision; the rest are just using earliest day\n", + "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']\n", + "\n", + "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " alt.Color('count(pgpid)', title='# of documents')\n", + ").properties(title='document frequency by weekday')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "6b2f24de-18ce-4f40-b300-e8cc334a338c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "undate_weekday_name\n", + "Monday 305\n", + "Thursday 282\n", + "Tuesday 241\n", + "Sunday 229\n", + "Wednesday 229\n", + "Friday 215\n", + "Saturday 98\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_days.undate_weekday_name.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "dea83b43-b379-4807-8a33-8e26d7f4f8e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_calendar_chart = alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " # alt.Y('doc_date_calendar'),\n", + " alt.Color('count(pgpid)')\n", + ").facet(row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")).properties(title='document frequency by weekday and calendar')\n", + "weekday_calendar_chart" + ] + }, + { + "cell_type": "markdown", + "id": "484069be-8f75-4197-8f96-4683ab509028", + "metadata": {}, + "source": [ + "This chart is skewed due to the fact we have so many more day-precision dates from the Hebrew calendar than any other. " + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "cfecdb64-03b4-405b-b1f3-85e876f55680", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Anno Mundi 82\n", + "Seleucid 20\n", + "Hijrī 2\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates.doc_date_calendar.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "bfdfcf6b-d572-4f9b-8538-eca932f50942", + "metadata": {}, + "source": [ + "This is more obvious if we use independent color scales." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "e66917b0-2221-42dd-a99b-df847b8e815b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_calendar_chart.resolve_scale(color='independent')" + ] + }, + { + "cell_type": "markdown", + "id": "8e2a74a1-546b-4069-bff5-29788dee8997", + "metadata": {}, + "source": [ + "What about weekday by century? " + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "6a7a0bf5-f8c2-4034-8495-2fb4b297740a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precisiontypeundate_weekdayundate_weekday_nameorig_weekdaycentury
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28dayLegal document3ThursdayThursday1200s
17142418Monday 20 Tevet 1520Seleucid1208-12-291208-12-291520-10-20dayLegal document0MondayMonday1200s
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25dayLegal document0MondayMonday1100s
20132739Wednesday 29th Elul 1354Seleucid1043-09-071043-09-071354-06-29dayLegal document2WednesdayWednesday1000s
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29dayLegal document3ThursdayThursday1200s
.......................................
2930334623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20dayLegal document0MondayMonday1200s
2992435264Wednesday 13 Ṭevet 1526Seleucid1214/12151214/12151526-10-13dayLegal document2WednesdayWednesday1200s
3400839564Monday 16 Tevet 1339Seleucid1027-12-181027-12-181339-10-16dayLegal document0MondayMonday1000s
3446640035Monday 1st Iyyar 1437Seleucid1126-04-261126-04-261437-02-01dayLegal document0MondayMonday1100s
3446740036Friday 15 of Adar 1443Seleucid1132-03-041132-03-041443-12-15dayLegal document4FridayFriday1100s
\n", + "

104 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1714 2418 Monday 20 Tevet 1520 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "... ... ... ... \n", + "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", + "34008 39564 Monday 16 Tevet 1339 Seleucid \n", + "34466 40035 Monday 1st Iyyar 1437 Seleucid \n", + "34467 40036 Friday 15 of Adar 1443 Seleucid \n", + "\n", + " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", + "851 1270 1270 1581-03-28 day \n", + "1714 1208-12-29 1208-12-29 1520-10-20 day \n", + "1929 1133 1133 1444-09-25 day \n", + "2013 1043-09-07 1043-09-07 1354-06-29 day \n", + "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "... ... ... ... ... \n", + "29303 1266/1267 1266/1267 1578-10-20 day \n", + "29924 1214/1215 1214/1215 1526-10-13 day \n", + "34008 1027-12-18 1027-12-18 1339-10-16 day \n", + "34466 1126-04-26 1126-04-26 1437-02-01 day \n", + "34467 1132-03-04 1132-03-04 1443-12-15 day \n", + "\n", + " type undate_weekday undate_weekday_name orig_weekday century \n", + "851 Legal document 3 Thursday Thursday 1200s \n", + "1714 Legal document 0 Monday Monday 1200s \n", + "1929 Legal document 0 Monday Monday 1100s \n", + "2013 Legal document 2 Wednesday Wednesday 1000s \n", + "3257 Legal document 3 Thursday Thursday 1200s \n", + "... ... ... ... ... ... 
\n", + "29303 Legal document 0 Monday Monday 1200s \n", + "29924 Legal document 2 Wednesday Wednesday 1200s \n", + "34008 Legal document 0 Monday Monday 1000s \n", + "34466 Legal document 0 Monday Monday 1100s \n", + "34467 Legal document 4 Friday Friday 1100s \n", + "\n", + "[104 rows x 12 columns]" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get rough century (gregorian calendar)\n", + "weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: (\"%04d\" % x.earliest.year)[:2] + \"00s\")\n", + "\n", + "weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'century']].head()\n", + "weekday_dates" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "eb99871e-d9a5-4211-9bd2-5a9acfe8face", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " alt.Y('century'),\n", + " alt.Color('count(pgpid)')\n", + ").properties(title='document frequency by weekday and century')\n" + ] + }, + { + "cell_type": "markdown", + "id": "cfd1e93b-1286-43d9-be44-34ba607435e1", + "metadata": {}, + "source": [ + "The weekday + century heatmap suggests we're more likely to have day-level precision dates from the 1700s than any other time period in the dataset." + ] + }, + { + "cell_type": "markdown", + "id": "2ec7d437-092f-47de-b60c-a1b72f45b4dd", + "metadata": {}, + "source": [ + "## Plot frequency by month and calendar" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "08a58fcf-2b08-441b-9dc8-385bafeb88e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what about heat map by month?\n", + "\n", + "# get numeric month\n", + "orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)\n", + "# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n", + "\n", + "has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]\n", + "\n", + "alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n", + " alt.X('undate_month', title='month'),\n", + " alt.Color('count(pgpid)', title='# of documents')\n", + ").facet(\n", + " row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", + ").properties(title='Document frequency by month and calendar')" + ] + }, + { + "cell_type": "markdown", + "id": "2ad489d5-483d-4280-a7d8-0090fdd2aa32", + "metadata": {}, + "source": [ + "That very light month 13 in the Hebrew and Seleucid calendars reflects the fact that the Hebrew calendar has a leap _month_." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "a7a16c53-6f01-4457-9458-4fcf80a35c51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Seleucid 1196\n", + "Anno Mundi 903\n", + "Hijrī 516\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "has_month.doc_date_calendar.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "65bce74e-67b7-48df-9f7f-a6f264af4f11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1593, 38)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_days[orig_dates_days.undate_weekday_name.notna()].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "ac940883-e00e-4dde-8339-95a1b733f6f3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_69693/2787254306.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# weekday frequency by month?\n", + "\n", + "orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n", + "\n", + "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " alt.Y('undate_month', title=\"month\"),\n", + " alt.Color('count(pgpid)')\n", + ").facet(\n", + " column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", + ").properties(title='Document frequency by weekday and month (1,557 documents)')\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 2dc6515..ef2fe99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ classifiers = [ [project.optional-dependencies] docs = ["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"] test = ["pytest>=7.2", "pytest-ordering", "pytest-cov"] -notebooks = ["jupyterlab", "pandas", "treon"] +notebooks = ["jupyterlab", "pandas", "treon", "altair"] check = ["undate[docs]", "undate[notebooks]", "mypy", "ruff"] dev = [ "pre-commit>=2.20.0", diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py index a43a270..5836b2f 100644 --- a/src/undate/converters/calendars/__init__.py +++ b/src/undate/converters/calendars/__init__.py @@ -1,5 +1,11 @@ from 
undate.converters.calendars.gregorian import GregorianDateConverter from undate.converters.calendars.hebrew import HebrewDateConverter from undate.converters.calendars.islamic import IslamicDateConverter +from undate.converters.calendars.seleucid import SeleucidDateConverter -__all__ = ["GregorianDateConverter", "HebrewDateConverter", "IslamicDateConverter"] +__all__ = [ + "GregorianDateConverter", + "HebrewDateConverter", + "IslamicDateConverter", + "SeleucidDateConverter", +] diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark index b55ec3f..6f4244c 100644 --- a/src/undate/converters/calendars/hebrew/hebrew.lark +++ b/src/undate/converters/calendars/hebrew/hebrew.lark @@ -3,7 +3,7 @@ // only support day month year format for now // parser requires numeric day and year to be distinguished based on order -hebrew_date: day month year | month year | year +hebrew_date: weekday? day month comma? year | month year | year // TODO: handle date ranges? @@ -27,10 +27,14 @@ month: month_1 | month_10 | month_11 | month_12 - | month_13 + | month_13 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ +comma: "," +weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma? 
+ + // months, in order; from convertdate list // with variants from Princeton Geniza Project // support matching with and without accents @@ -43,11 +47,13 @@ month_5: "Av" month_6: "Elul" // Tishrei or Tishri month_7: /Tishre?i/ -month_8: "Heshvan" +// Heshvan, Ḥeshvan, Marḥeshvan +month_8: /(Mar)?[ḤHḥ]eshvan/ month_9: "Kislev" // Tevet or Teveth month_10: /[ṬT]eveth?/ -month_11: "Shevat" +// Shevat or Shevaṭ +month_11: /Sheva[tṭ]/ // Adar I or Adar month_12: /Adar( I)?/ // Adar II or Adar Bet diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py index 48e8b20..8880434 100644 --- a/src/undate/converters/calendars/hebrew/transformer.py +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -13,6 +13,8 @@ class HebrewDateTransformer(Transformer): """Transform a Hebrew date parse tree and return an Undate or UndateInterval.""" + calendar = Calendar.HEBREW + def hebrew_date(self, items): parts = {} for child in items: @@ -22,9 +24,9 @@ def hebrew_date(self, items): value = int(child.children[0]) parts[str(child.data)] = value - # initialize and return an undate with islamic year, month, day and - # islamic calendar - return HebrewUndate(**parts) + # initialize and return an undate with year, month, day and + # configured calendar (hebrew by default) + return Undate(**parts, calendar=self.calendar) # year translation is not needed since we want a tree with name year # this is equivalent to a no-op diff --git a/src/undate/converters/calendars/islamic/islamic.lark b/src/undate/converters/calendars/islamic/islamic.lark index 3ad59a5..1e4940b 100644 --- a/src/undate/converters/calendars/islamic/islamic.lark +++ b/src/undate/converters/calendars/islamic/islamic.lark @@ -3,7 +3,7 @@ // only support day month year format for now // parser requires numeric day and year to be distinguished based on order -islamic_date: day month year | month year | year +islamic_date: weekday? 
day month year | month year | year // TODO: handle date ranges? @@ -13,6 +13,7 @@ islamic_date: day month year | month year | year year: /\d+/ + // months month: month_1 | month_2 @@ -29,6 +30,10 @@ month: month_1 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ + +comma: "," +weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma? + // months, in order; from convertdate list // with variants from Princeton Geniza Project // support matching with and without accents @@ -42,7 +47,7 @@ month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|II)/ // Jumādā al-ʾAwwal or Jumādā I month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/ // Jumādā ath-Thāniya or Jumādā II -month_6: /Jum[āa]d[āa] (ath-Th[āa]niyah|II)/ +month_6: /Jum[āa][dḍ][āa] (ath-Th[āa]niyah|II)/ month_7: "Rajab" // Shaʿbān month_8: /Sha[ʿ']b[āa]n/ diff --git a/src/undate/converters/calendars/seleucid.py b/src/undate/converters/calendars/seleucid.py new file mode 100644 index 0000000..bddf867 --- /dev/null +++ b/src/undate/converters/calendars/seleucid.py @@ -0,0 +1,24 @@ +from undate.converters.calendars import HebrewDateConverter +from undate.undate import Calendar + + +class SeleucidDateConverter(HebrewDateConverter): + #: offset for Seleucid calendar: Seleucid year + 3449 = Anno Mundi year + SELEUCID_OFFSET = 3449 + + #: converter name: Seleucid + name: str = "Seleucid" + calendar_name: str = "Seleucid" + + def __init__(self): + super().__init__() + # override hebrew calendar to initialize undates with seleucid + # calendar; this triggers Seleucid calendar to_gregorian method use + self.transformer.calendar = Calendar.SELEUCID + + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: + """Convert a Seleucid date, specified by year, month, and day, + to the Gregorian equivalent date. Uses hebrew calendar conversion + logic with :attr:`SELEUCID_OFFSET`. Returns a tuple of year, month, day. 
+ """ + return super().to_gregorian(year + self.SELEUCID_OFFSET, month, day) diff --git a/src/undate/date.py b/src/undate/date.py index 27f6efa..c953995 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -104,6 +104,27 @@ def day(self) -> Optional[int]: return int(str(self.astype("datetime64[D]")).split("-")[-1]) return None + @property + def weekday(self) -> Optional[int]: + """Equivalent to :meth:`datetime.date.weekday`; returns day of week as an + integer where Monday is 0 and Sunday is 6. Only supported for dates + with date unit in days. + """ + # only return a weekday if date unit is in days + if self.dtype == "datetime64[D]": + # calculate based on difference between current day and week start + # numpy datetime weeks start on thursdays - presumably since + # unix epoch day zero was a thursday... + + # implementation inspired in part by https://stackoverflow.com/a/54264187 + + thursday_week = self.astype("datetime64[W]") + days_from_thursday = (self - thursday_week).astype(int) + # if monday is 0, thursday is 3 + return (days_from_thursday + 3) % 7 + + return None + def __sub__(self, other): # modify to conditionally return a timedelta object instead of a # Date object with dtype timedelta64[D] (default behavior) diff --git a/src/undate/undate.py b/src/undate/undate.py index be4454a..dc4d506 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -29,6 +29,7 @@ class Calendar(StrEnum): GREGORIAN = auto() HEBREW = auto() ISLAMIC = auto() + SELEUCID = auto() @staticmethod def get_converter(calendar): @@ -96,7 +97,6 @@ def __init__( if calendar is not None: self.set_calendar(calendar) self.calendar_converter = Calendar.get_converter(self.calendar) - self.calculate_earliest_latest(year, month, day) if converter is None: @@ -192,9 +192,12 @@ def calculate_earliest_latest(self, year, month, day): ) def set_calendar(self, calendar: Union[str, Calendar]): + """Find calendar by name if passed as string and set on the object. 
+ Only intended for use at initialization time; use :meth:`as_calendar` + to change calendar.""" if calendar is not None: # if not passed as a Calendar instance, do a lookup - if not isinstance(calendar, Calendar): + if isinstance(calendar, str): # look for calendar by upper-case name try: calendar = Calendar[calendar.upper()] @@ -202,6 +205,19 @@ def set_calendar(self, calendar: Union[str, Calendar]): raise ValueError(f"Calendar `{calendar}` is not supported") from err self.calendar = calendar + def as_calendar(self, calendar: Union[str, Calendar]): + """Return a new :class:`Undate` object with the same year, month, day, and labels + used to initialize the current object, but with a different calendar. Note that this + does NOT do calendar conversion, but reinterprets current numeric year, month, day values + according to the new calendar.""" + return Undate( + year=self.initial_values.get("year"), + month=self.initial_values.get("month"), + day=self.initial_values.get("day"), + label=self.label, + calendar=calendar, + ) + def __str__(self) -> str: # if any portion of the date is partially known, construct # pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits @@ -319,8 +335,12 @@ def __lt__(self, other: object) -> bool: # (e.g., single date within the same year) # comparison for those cases is not currently supported elif other in self or self in other: + # sort by precision, most precise first + by_precision = sorted( + [self, other], key=lambda x: x.precision, reverse=True + ) raise NotImplementedError( - "Can't compare when one date falls within the other" + f"Can't compare when one date ({by_precision[0]}) falls within the other ({by_precision[1]})" ) # NOTE: unsupported comparisons are supposed to return NotImplemented # However, doing that in this case results in a confusing TypeError! 
@@ -405,7 +425,7 @@ def year(self) -> Optional[str]: year = self._get_date_part("year") if year: return f"{year:0>4}" - # if value is unset but date precision is month or greater, return unknown month + # if value is unset but date precision is year or greater, return unknown year elif self.precision >= DatePrecision.YEAR: return self.MISSING_DIGIT * 4 return None diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py index 6e4a5e6..7dcca83 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py @@ -26,6 +26,12 @@ def test_hebrew_undate(): ("5362", HebrewUndate(5362), DatePrecision.YEAR), # add when we support parsing ranges: # Adar I and Adar II 5453 : (1693 CE) + # support weekdays included in text + ("Thursday, 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY), + # with or without comma + ("Thursday 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY), + # huh, current parsing completely ignores whitespace; do we want that? 
+ ("Thursday12Sivan4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY), ] diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py index 951a9f8..04ff53b 100644 --- a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py +++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py @@ -28,6 +28,7 @@ def test_islamic_undate(): # examples from ISMI data (reformatted to day month year) # Rabi 1 = month 3 ("14 Rabīʿ I 901", IslamicUndate(901, 3, 14), DatePrecision.DAY), + ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH), ("884", IslamicUndate(884), DatePrecision.YEAR), # Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), # add when we support parsing ranges: diff --git a/tests/test_converters/test_calendars/test_seleucid.py b/tests/test_converters/test_calendars/test_seleucid.py new file mode 100644 index 0000000..fd8bc82 --- /dev/null +++ b/tests/test_converters/test_calendars/test_seleucid.py @@ -0,0 +1,109 @@ +from undate.converters.calendars import SeleucidDateConverter +from undate.date import Date, DatePrecision +from undate.undate import Calendar, Undate + + +class TestSeleucidDateConverter: + def test_parse(self): + # day + # Elul = month 6; 7 September, 1000 Gregorian + date_str = "29 Elul 1311" + date = SeleucidDateConverter().parse(date_str) + assert date == Undate(1311, 6, 29, calendar="Seleucid") + assert date.calendar == Calendar.SELEUCID + assert date.precision == DatePrecision.DAY + assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}" + + date_str = "Tishri 1458" # month 7 + date = SeleucidDateConverter().parse(date_str) + assert date == Undate(1458, 7, calendar="Seleucid") + assert date.calendar == Calendar.SELEUCID + assert date.precision == DatePrecision.MONTH + assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}" + + # year 
+ date_str = "1458" + date = SeleucidDateConverter().parse(date_str) + assert date == Undate(1458, calendar="Seleucid") + assert date.calendar == Calendar.SELEUCID + assert date.precision == DatePrecision.YEAR + assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}" + + def test_gregorian_earliest_latest(self): + # earliest/latest should be converted to Gregorian for comparison + + # full date + # Elul = month 6 (7 September, 1000 Gregorian) + date_str = "29 Elul 1311" + date = SeleucidDateConverter().parse(date_str) + assert date.earliest == Date(1000, 9, 7) + assert date.latest == Date(1000, 9, 7) + assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}" + + date_str = "23 Adar I 1475" + date = SeleucidDateConverter().parse(date_str) + assert date.earliest == Date(1164, 2, 25) + assert date.latest == Date(1164, 2, 25) + assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}" + + # month/year + date_str = "Tishri 1458" + date = SeleucidDateConverter().parse(date_str) + assert date.earliest == Date(1146, 9, 16) + assert date.latest == Date(1146, 10, 15) + assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}" + + +# TODO: update validation error to say seleucid instead of hebrew + +# seleucid_year = 1458 +# converted_date = convert_seleucid_date(f"Tishri {seleucid_year}") +# converted_date_am = convert_hebrew_date( +# f"Tishrei {seleucid_year + Calendar.SELEUCID_OFFSET}" +# ) +# # the converted date range for Tishri Sel. should be the same as that for Tishri AM - 3449 years. 
+# assert converted_date[0] == converted_date_am[0] +# assert converted_date[1] == converted_date_am[1] + +# # leap day (Feb 29, 2020) should convert properly +# converted_date = convert_seleucid_date("4 Adar 2331") +# assert converted_date[1] == date(2020, 2, 29) + + +# # 26 Tammuz 4816: 17 July, 1056; Tammuz = month 4 +# date = Undate(4816, 4, 26, calendar="Seleucid") +# assert date.earliest == Date(1056, 7, 17) +# assert date.latest == Date(1056, 7, 17) +# # 13 Tishrei 5416 Anno Mundi (1655-10-14) +# date = Undate(5416, 7, 13, calendar="Seleucid") # Tishrei = month 7 +# assert date.earliest == Date(1655, 10, 14) +# assert date.latest == Date(1655, 10, 14) + + +# from pgp tests + + +# # month/year +# seleucid_year = 1458 +# converted_date = convert_seleucid_date(f"Tishri {seleucid_year}") +# converted_date_am = convert_hebrew_date( +# f"Tishrei {seleucid_year + Calendar.SELEUCID_OFFSET}" +# ) +# # the converted date range for Tishri Sel. should be the same as that for Tishri AM - 3449 years. 
+# assert converted_date[0] == converted_date_am[0] +# assert converted_date[1] == converted_date_am[1] + +# # leap day (Feb 29, 2020) should convert properly +# converted_date = convert_seleucid_date("4 Adar 2331") +# assert converted_date[1] == date(2020, 2, 29) + +# # leap year (4826 AM = 1377 Seleucid) should convert properly +# seleucid_year = 1377 +# converted_date = convert_seleucid_date(f"21 Adar II {seleucid_year}") +# converted_date_am = convert_hebrew_date( +# f"21 Adar II {seleucid_year + Calendar.SELEUCID_OFFSET}" +# ) +# assert converted_date[0] == converted_date_am[0] +# assert converted_date[1] == converted_date_am[1] +# # and it should be converted to 1066-03-21 CE +# assert converted_date[1] == date(1066, 3, 21) diff --git a/tests/test_date.py b/tests/test_date.py index 5ff017d..d5c7d7b 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,3 +1,5 @@ +import datetime + import numpy as np from undate.date import ONE_YEAR, Date, DatePrecision, Timedelta @@ -51,6 +53,31 @@ def test_properties_day(self): assert Date(2010, 5).day is None assert Date(2021, 6, 15).day == 15 + def test_weekday(self): + # thursday + assert Date(2025, 1, 2).weekday == 3 + assert Date(2025, 1, 2).weekday == datetime.date(2025, 1, 2).weekday() + # friday + assert Date(2025, 1, 3).weekday == 4 + assert Date(2025, 1, 3).weekday == datetime.date(2025, 1, 3).weekday() + # saturday + assert Date(2025, 1, 4).weekday == 5 + assert Date(2025, 1, 4).weekday == datetime.date(2025, 1, 4).weekday() + # sunday + assert Date(2025, 1, 5).weekday == 6 + assert Date(2025, 1, 5).weekday == datetime.date(2025, 1, 5).weekday() + # monday + assert Date(2025, 1, 6).weekday == 0 + assert Date(2025, 1, 6).weekday == datetime.date(2025, 1, 6).weekday() + # tuesday + assert Date(2025, 1, 7).weekday == 1 + assert Date(2025, 1, 7).weekday == datetime.date(2025, 1, 7).weekday() + + # when a date is not day-level precision, no weekday is returned + yearonly_date = Date(2025) + assert 
yearonly_date.dtype == "datetime64[Y]" + assert yearonly_date.weekday is None + def test_substract(self): # date - date = timedelta date_difference = Date(2024, 1, 2) - Date(2024, 1, 1) diff --git a/tests/test_undate.py b/tests/test_undate.py index 18e03b0..16ea08c 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -130,6 +130,16 @@ def test_calendar(self): with pytest.raises(ValueError, match="Calendar `foobar` is not supported"): Undate(848, calendar="foobar") + def test_as_calendar(self): + # changes calendar *without* converting dates + assert Undate(1243, 5, 7).as_calendar(Calendar.ISLAMIC) == Undate( + 1243, 5, 7, calendar=Calendar.ISLAMIC + ) + # should also work with string + assert Undate(1243, 5, 7).as_calendar("islamic") == Undate( + 1243, 5, 7, calendar=Calendar.ISLAMIC + ) + def test_init_invalid(self): with pytest.raises(ValueError): Undate("19??") @@ -298,11 +308,17 @@ def test_lt_notimplemented(self): # how to compare mixed precision where dates overlap? # if the second date falls *within* earliest/latest, # then it is not clearly less; not implemented? - with pytest.raises(NotImplementedError, match="date falls within the other"): + with pytest.raises( + NotImplementedError, + match="one date \\(2022-05\\) falls within the other \\(2022\\)", + ): assert Undate(2022) < Undate(2022, 5) # same if we attempt to compare in the other direction - with pytest.raises(NotImplementedError, match="date falls within the other"): + with pytest.raises( + NotImplementedError, + match="one date \\(2022-05\\) falls within the other \\(2022\\)", + ): assert Undate(2022, 5) < Undate(2022) testdata_contains = [