diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 31e01ed..3fdbd7c 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -33,26 +33,25 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
+ cache: 'pip'
+ cache-dependency-path: '**/pyproject.toml'
- - name: Install uv
- uses: astral-sh/setup-uv@v5
- with:
- enable-cache: true
- cache-dependency-glob: "pyproject.toml"
-
- - name: Install package with dev and test dependencies
- run: uv sync --extra test
+ - name: Install package with dependencies
+ run: pip install -e ".[test]"
# for all versions but the one we use for code coverage, run normally
- - name: Run unit tests normally
- run: uv run pytest
+ - name: Run unit tests without code coverage
+ run: pytest
if: ${{ matrix.python != env.COV_PYTHON_VERSION }}
# run code coverage in one version only
- name: Run unit tests with code coverage reporting
- run: uv run pytest --cov=undate
+ run: pytest --cov=.
if: ${{ matrix.python == env.COV_PYTHON_VERSION }}
- - name: Upload test coverage to Codecov
- uses: codecov/codecov-action@v3
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v4
+ env:
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
if: ${{ matrix.python == env.COV_PYTHON_VERSION }}
diff --git a/examples/pgp_dates.ipynb b/examples/pgp_dates.ipynb
new file mode 100644
index 0000000..43a858c
--- /dev/null
+++ b/examples/pgp_dates.ipynb
@@ -0,0 +1,4406 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2d231f1e-3944-4579-b868-504f7fb2d543",
+ "metadata": {},
+ "source": [
+ "# Princeton Geniza Project\n",
+ "\n",
+ "This notebook demonstrates parsing dates from non-Gregorian calendars and working with mixed-calendar dates.\n",
+ "\n",
+ "This notebook uses document data from the [Princeton Geniza Project](https://geniza.princeton.edu/), which is a database of fragmentary medieval documents found in the Cairo Geniza. Documents are written largely in Hebrew script in Hebrew and Arabic languages, and use a range of calendars including: \n",
+ "- Hebrew _Anno Mundi_\n",
+ "- Islamic _Hijri_\n",
+ "- Hebrew Seleucid calendar (_Anno Mundi_ calendar with a 3449 year offset)\n",
+ "\n",
+ "The dataset includes original dates and standardized Common Era dates (Julian before 1583, Gregorian after).\n",
+ "\n",
+ "This notebook uses the data published on GitHub at https://github.com/princetongenizalab/pgp-metadata\n",
+ "\n",
+ "\n",
+ "*Notebook authored by Rebecca Sutton Koeser, 2025.*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d9da1cf-6cc6-4b6a-9baf-782152998d82",
+ "metadata": {},
+ "source": [
+ "## Load and filter data\n",
+ "\n",
+ "Limit to documents with authoritative \"date on document\" set in the metadata."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "67c5532d-ebc4-4e1e-aa64-e6802ed1d971",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "pgp_documents_csv = \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n",
+ "documents = pd.read_csv(pgp_documents_csv)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "41dc5a05-a04b-4b6d-acfe-1f7b04849346",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Total documents: 35,187\n",
+ "Documents with dates: 4,451\n",
+ " date on document: 4,126\n",
+ " inferred dating: 331\n"
+ ]
+ }
+ ],
+ "source": [
+ "# limit to documents with dates\n",
+ "docs_with_dates = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()]\n",
+ "docs_with_docdate = documents[documents.doc_date_standard.notna()].copy()\n",
+ "docs_with_inferreddate = documents[documents.inferred_date_standard.notna()]\n",
+ "\n",
+ "print(f\"\"\"\n",
+ "Total documents: {len(documents):,}\n",
+ "Documents with dates: {len(docs_with_dates):,}\n",
+ " date on document: {len(docs_with_docdate):,}\n",
+ " inferred dating: {len(docs_with_inferreddate):,}\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "94d6340b-10d0-461b-b745-378ffa1ffcec",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1259 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1116-03-05 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 464 | \n",
+ " Tammuz 1288 | \n",
+ " Seleucid | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " 491 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1131 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1188-12-07 | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " 502 | \n",
+ " Tevet 1548 | \n",
+ " Seleucid | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " 506 | \n",
+ " Elul 1428 | \n",
+ " Seleucid | \n",
+ " 1117-08-01/1117-08-29 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " 516 | \n",
+ " First decade of Ḥeshvan 1442 | \n",
+ " Seleucid | \n",
+ " 1130-10-06/1130-10-15 | \n",
+ "
\n",
+ " \n",
+ " | 61 | \n",
+ " 524 | \n",
+ " Thursday, 12 Sivan 4795 | \n",
+ " Anno Mundi | \n",
+ " 1035-05-22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "5 449 1570 Seleucid \n",
+ "16 463 19 Adar 1427 Seleucid \n",
+ "17 464 Tammuz 1288 Seleucid \n",
+ "23 472 1337 Seleucid \n",
+ "36 491 NaN NaN \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid \n",
+ "43 502 Tevet 1548 Seleucid \n",
+ "47 506 Elul 1428 Seleucid \n",
+ "55 516 First decade of Ḥeshvan 1442 Seleucid \n",
+ "61 524 Thursday, 12 Sivan 4795 Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "5 1259 \n",
+ "16 1116-03-05 \n",
+ "17 0977-06-21/0977-07-19 \n",
+ "23 1025-08-28/1026-09-14 \n",
+ "36 1131 \n",
+ "41 1188-12-07 \n",
+ "43 1236-11-30/1236-12-28 \n",
+ "47 1117-08-01/1117-08-29 \n",
+ "55 1130-10-06/1130-10-15 \n",
+ "61 1035-05-22 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "docs_with_docdate[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4df9e446-1f9c-4585-9557-3164cc8ce01f",
+ "metadata": {},
+ "source": [
+ "## Parse dates (standard and original)\n",
+ "\n",
+ "Parse the standardized date (Julian/Gregorian) as EDTF; in some cases this may fail due to invalid user-entered data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b9703b47-a7e2-4178-a7da-fb47db11b5b7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parse error on 1217-02-20/1217-02-29: Error trying to process rule \"date\":\n",
+ "\n",
+ "Day out of range in datetime string \"1217-02-29\"\n",
+ "Parse error on 1747-02-29: Error trying to process rule \"date\":\n",
+ "\n",
+ "Day out of range in datetime string \"1747-02-29\"\n"
+ ]
+ }
+ ],
+ "source": [
+ "from lark.visitors import VisitError\n",
+ "\n",
+ "# first, how far can we get with the standard dates? can we parse as edtf and sort, render?\n",
+ "from undate import Undate \n",
+ "\n",
+ "def parse_standard_date(value):\n",
+ " try:\n",
+ " return Undate.parse(value, \"EDTF\")\n",
+ " except VisitError as err:\n",
+ " print(f\"Parse error on {value}: {err}\")\n",
+ " \n",
+ "\n",
+ "# ignore gregorian/julian distinction for now\n",
+ "# from pgp code:\n",
+ "# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October\n",
+ "# cut off between gregorian/julian dates, in julian days\n",
+ "#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n",
+ "\n",
+ "docs_with_docdate['undate_standard'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "f49e82a4-b05b-4395-998f-0c9e75729e9f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " last_modified | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 3190 | \n",
+ " 3957 | \n",
+ " middle decade of Adar 1528 | \n",
+ " Seleucid | \n",
+ " 1217-02-20/1217-02-29 | \n",
+ " 2025-04-12 20:45:36.603800+00:00 | \n",
+ "
\n",
+ " \n",
+ " | 34437 | \n",
+ " 40006 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1747-02-29 | \n",
+ " 2024-08-07 18:24:19.425288+00:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "3190 3957 middle decade of Adar 1528 Seleucid \n",
+ "34437 40006 NaN NaN \n",
+ "\n",
+ " doc_date_standard last_modified \n",
+ "3190 1217-02-20/1217-02-29 2025-04-12 20:45:36.603800+00:00 \n",
+ "34437 1747-02-29 2024-08-07 18:24:19.425288+00:00 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what are the records with standardized dates that couldn't be parsed?\n",
+ "\n",
+ "# this is probably a data error in the original\n",
+ "\n",
+ "docs_with_docdate[docs_with_docdate.undate_standard.isna()][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3632e7f2-aae9-4136-9bb0-32789de34c4e",
+ "metadata": {},
+ "source": [
+ "What calendars are used by documents with original dates?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "2d502575-a2b4-4fce-9f59-6932275dfac2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Seleucid 1604\n",
+ "Anno Mundi 1147\n",
+ "Hijrī 884\n",
+ "Kharājī 8\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "docs_with_docdate.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "04e4ffb2-13e7-49cc-913b-2104b61aef16",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 61 | \n",
+ " 524 | \n",
+ " Thursday, 12 Sivan 4795 | \n",
+ " Anno Mundi | \n",
+ " 1035-05-22 | \n",
+ "
\n",
+ " \n",
+ " | 90 | \n",
+ " 561 | \n",
+ " 10 Nisan 4716 | \n",
+ " Anno Mundi | \n",
+ " 0956-03-24 | \n",
+ "
\n",
+ " \n",
+ " | 111 | \n",
+ " 582 | \n",
+ " Thursday, 6 Adar 4996 | \n",
+ " Anno Mundi | \n",
+ " 1236-02-14 | \n",
+ "
\n",
+ " \n",
+ " | 119 | \n",
+ " 591 | \n",
+ " Sunday, 29 Tammuz 4898 | \n",
+ " Anno Mundi | \n",
+ " 1138-07-10 | \n",
+ "
\n",
+ " \n",
+ " | 131 | \n",
+ " 603 | \n",
+ " 4805/4806 | \n",
+ " Anno Mundi | \n",
+ " 1044-08-27/1045-09-13 | \n",
+ "
\n",
+ " \n",
+ " | 177 | \n",
+ " 660 | \n",
+ " 22 Sivan 4974 | \n",
+ " Anno Mundi | \n",
+ " 1214-06-01 | \n",
+ "
\n",
+ " \n",
+ " | 207 | \n",
+ " 695 | \n",
+ " Friday, [25] Nisan [4810] | \n",
+ " Anno Mundi | \n",
+ " 1050-04-20 | \n",
+ "
\n",
+ " \n",
+ " | 215 | \n",
+ " 703 | \n",
+ " 8 Elul (4)811 | \n",
+ " Anno Mundi | \n",
+ " 1051-08-18 | \n",
+ "
\n",
+ " \n",
+ " | 255 | \n",
+ " 750 | \n",
+ " Friday, 24 Ḥeshvan 4765 | \n",
+ " Anno Mundi | \n",
+ " 1004-11-10 | \n",
+ "
\n",
+ " \n",
+ " | 264 | \n",
+ " 760 | \n",
+ " Thursday, 11 Av 4783 | \n",
+ " Anno Mundi | \n",
+ " 1023-08-01 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard\n",
+ "61 524 Thursday, 12 Sivan 4795 Anno Mundi 1035-05-22\n",
+ "90 561 10 Nisan 4716 Anno Mundi 0956-03-24\n",
+ "111 582 Thursday, 6 Adar 4996 Anno Mundi 1236-02-14\n",
+ "119 591 Sunday, 29 Tammuz 4898 Anno Mundi 1138-07-10\n",
+ "131 603 4805/4806 Anno Mundi 1044-08-27/1045-09-13\n",
+ "177 660 22 Sivan 4974 Anno Mundi 1214-06-01\n",
+ "207 695 Friday, [25] Nisan [4810] Anno Mundi 1050-04-20\n",
+ "215 703 8 Elul (4)811 Anno Mundi 1051-08-18\n",
+ "255 750 Friday, 24 Ḥeshvan 4765 Anno Mundi 1004-11-10\n",
+ "264 760 Thursday, 11 Av 4783 Anno Mundi 1023-08-01"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# example hebrew dates\n",
+ "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "101b8194-35b3-4e7e-b3e4-68dfec2e932c",
+ "metadata": {},
+ "source": [
+ "### Inspect variations in the data that may cause problems for parsing\n",
+ "\n",
+ "There are some idiosyncrasies with the original dates, since some of them were entered before the PGPv4 system supported built-in conversion.\n",
+ "\n",
+ "- calendar abbreviation included in the date string (i.e., AM, AH for _Anno Mundi_, _Anno Hegirae_ respectively)\n",
+ "- brackets for inferred digits or unknown digits (e.g., `152[.]` or `[4]82[.]`)\n",
+ "- ordinals instead of numerals for the day of the month (e.g., \"11th Tammuz 4767\" or \"Monday, 27th Ṭevet 4797\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "4d11e583-7c80-44ed-80b1-d0c5b7b7f408",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_38072/1200615794.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
+ " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 702 | \n",
+ " 1223 | \n",
+ " Wednesday, 9 Tammuz 4912 AM | \n",
+ " Anno Mundi | \n",
+ " 1152-06-13 | \n",
+ "
\n",
+ " \n",
+ " | 16698 | \n",
+ " 19975 | \n",
+ " Sunday, 10 Kislev 5583 AM | \n",
+ " Anno Mundi | \n",
+ " 1822-11-24 | \n",
+ "
\n",
+ " \n",
+ " | 25415 | \n",
+ " 30550 | \n",
+ " Tammuz 5537 AM | \n",
+ " Anno Mundi | \n",
+ " 1777-07-06/1777-08-03 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "702 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n",
+ "16698 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n",
+ "25415 30550 Tammuz 5537 AM Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "702 1152-06-13 \n",
+ "16698 1822-11-24 \n",
+ "25415 1777-07-06/1777-08-03 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many end with AM ?\n",
+ "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n",
+ "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "cd1a751a-5299-418f-a3f8-050ab0384354",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1556 | \n",
+ " 2163 | \n",
+ " first third of Tammuz 500[.] | \n",
+ " Anno Mundi | \n",
+ " 1244/1249 | \n",
+ "
\n",
+ " \n",
+ " | 1567 | \n",
+ " 2175 | \n",
+ " End of Sivan 152[.] | \n",
+ " Seleucid | \n",
+ " 1209/1218 | \n",
+ "
\n",
+ " \n",
+ " | 1753 | \n",
+ " 2460 | \n",
+ " 13[..] | \n",
+ " Seleucid | \n",
+ " 988/1088 | \n",
+ "
\n",
+ " \n",
+ " | 2018 | \n",
+ " 2745 | \n",
+ " 1[.] Kislev 48[..] | \n",
+ " Anno Mundi | \n",
+ " 1039-11-30/1138-11-24 | \n",
+ "
\n",
+ " \n",
+ " | 3044 | \n",
+ " 3805 | \n",
+ " 13[..] | \n",
+ " Seleucid | \n",
+ " 988/1087 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 30589 | \n",
+ " 35955 | \n",
+ " 12 Muḥarram 52[.] | \n",
+ " Hijrī | \n",
+ " 1126/1134 | \n",
+ "
\n",
+ " \n",
+ " | 31226 | \n",
+ " 36738 | \n",
+ " 54[.] | \n",
+ " Hijrī | \n",
+ " 1145/1154 | \n",
+ "
\n",
+ " \n",
+ " | 32548 | \n",
+ " 38077 | \n",
+ " 14[...] | \n",
+ " Seleucid | \n",
+ " 1088-09-19/1188-09-23 | \n",
+ "
\n",
+ " \n",
+ " | 34652 | \n",
+ " 40226 | \n",
+ " 49[.] | \n",
+ " Hijrī | \n",
+ " 1096-12-19/1106-09-01 | \n",
+ "
\n",
+ " \n",
+ " | 34760 | \n",
+ " 40335 | \n",
+ " [4]82[.] | \n",
+ " Anno Mundi | \n",
+ " 1059-09-11/1069-09-18 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
66 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "1567 2175 End of Sivan 152[.] Seleucid \n",
+ "1753 2460 13[..] Seleucid \n",
+ "2018 2745 1[.] Kislev 48[..] Anno Mundi \n",
+ "3044 3805 13[..] Seleucid \n",
+ "... ... ... ... \n",
+ "30589 35955 12 Muḥarram 52[.] Hijrī \n",
+ "31226 36738 54[.] Hijrī \n",
+ "32548 38077 14[...] Seleucid \n",
+ "34652 40226 49[.] Hijrī \n",
+ "34760 40335 [4]82[.] Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "1556 1244/1249 \n",
+ "1567 1209/1218 \n",
+ "1753 988/1088 \n",
+ "2018 1039-11-30/1138-11-24 \n",
+ "3044 988/1087 \n",
+ "... ... \n",
+ "30589 1126/1134 \n",
+ "31226 1145/1154 \n",
+ "32548 1088-09-19/1188-09-23 \n",
+ "34652 1096-12-19/1106-09-01 \n",
+ "34760 1059-09-11/1069-09-18 \n",
+ "\n",
+ "[66 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many include periods?\n",
+ "docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "9fa8d2ba-6612-4de5-8741-dea177f99412",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 635 | \n",
+ " 1154 | \n",
+ " Last decade of Kislev 5004 | \n",
+ " Anno Mundi | \n",
+ " 1243-12 | \n",
+ "
\n",
+ " \n",
+ " | 1172 | \n",
+ " 1750 | \n",
+ " 11th Tammuz 4767 | \n",
+ " Anno Mundi | \n",
+ " 1007 | \n",
+ "
\n",
+ " \n",
+ " | 1173 | \n",
+ " 1751 | \n",
+ " Monday, 27th Ṭevet 4797 | \n",
+ " Anno Mundi | \n",
+ " 1037-01-23 | \n",
+ "
\n",
+ " \n",
+ " | 1556 | \n",
+ " 2163 | \n",
+ " first third of Tammuz 500[.] | \n",
+ " Anno Mundi | \n",
+ " 1244/1249 | \n",
+ "
\n",
+ " \n",
+ " | 5142 | \n",
+ " 6795 | \n",
+ " last decade of Tishrei 4991 | \n",
+ " Anno Mundi | \n",
+ " 1230-09-29/1230-10-08 | \n",
+ "
\n",
+ " \n",
+ " | 5223 | \n",
+ " 6892 | \n",
+ " last decade of Iyyar 4906 | \n",
+ " Anno Mundi | \n",
+ " 1146-05-04/1146-05-13 | \n",
+ "
\n",
+ " \n",
+ " | 5664 | \n",
+ " 7409 | \n",
+ " last third of Ḥeshvan 4965 | \n",
+ " Anno Mundi | \n",
+ " 1204-10-17/1204-10-25 | \n",
+ "
\n",
+ " \n",
+ " | 5812 | \n",
+ " 7581 | \n",
+ " middle third of Adar 4876 | \n",
+ " Anno Mundi | \n",
+ " 1116-05 | \n",
+ "
\n",
+ " \n",
+ " | 7024 | \n",
+ " 9068 | \n",
+ " Last decade of Ṭevet 4898 | \n",
+ " Anno Mundi | \n",
+ " 1138-01 | \n",
+ "
\n",
+ " \n",
+ " | 8638 | \n",
+ " 11215 | \n",
+ " Middle third of Av 4889 | \n",
+ " Anno Mundi | \n",
+ " 1129-07-29/1129-08-07 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "635 1154 Last decade of Kislev 5004 Anno Mundi \n",
+ "1172 1750 11th Tammuz 4767 Anno Mundi \n",
+ "1173 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n",
+ "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "5142 6795 last decade of Tishrei 4991 Anno Mundi \n",
+ "5223 6892 last decade of Iyyar 4906 Anno Mundi \n",
+ "5664 7409 last third of Ḥeshvan 4965 Anno Mundi \n",
+ "5812 7581 middle third of Adar 4876 Anno Mundi \n",
+ "7024 9068 Last decade of Ṭevet 4898 Anno Mundi \n",
+ "8638 11215 Middle third of Av 4889 Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "635 1243-12 \n",
+ "1172 1007 \n",
+ "1173 1037-01-23 \n",
+ "1556 1244/1249 \n",
+ "5142 1230-09-29/1230-10-08 \n",
+ "5223 1146-05-04/1146-05-13 \n",
+ "5664 1204-10-17/1204-10-25 \n",
+ "5812 1116-05 \n",
+ "7024 1138-01 \n",
+ "8638 1129-07-29/1129-08-07 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many use ordinals instead of numerals?\n",
+ "hebrew_dates[hebrew_dates.doc_date_original.str.contains(\"st\") | hebrew_dates.doc_date_original.str.contains(\"rd\") | hebrew_dates.doc_date_original.str.contains(\"th\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "5b6d5811-fe81-471d-bd29-896cec4c98ff",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11th Tammuz 4767: 11 Tammuz 4767\n",
+ "27th Tevet: 27 Tevet\n",
+ "8th Kislev: 8 Kislev\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "\n",
+ "def remove_ordinals(val):\n",
+ " return re.sub(r'(\\d+)(st|nd|rd|th)', \"\\\\1\", val)\n",
+ "\n",
+ "# test removing ordinals without removing the numbers\n",
+ "for val in ['11th Tammuz 4767', \"27th Tevet\", \"8th Kislev\"]:\n",
+ " print(f\"{val}: { remove_ordinals(val)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b0347b7-954b-4d2e-ad95-44dc2e24ac01",
+ "metadata": {},
+ "source": [
+ "Since this dataset has a mix of calendars and has known inconsistencies that may need cleaning,\n",
+ "we define a custom parsing method that selects the appropriate calendar and simplifies date portions that are not currently supported by the undate parsers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "798da8f2-2332-48c2-aeec-214474e9d49c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# parse hijri, anno mundi, and seleucid dates as undates\n",
+ "\n",
+ "from lark.exceptions import UnexpectedEOF\n",
+ "\n",
+ "# set this to True to see details about parsing\n",
+ "VERBOSE_PARSE_OUTPUT = False \n",
+ "\n",
+ "\n",
+ "def parse_original_date(row):\n",
+ " # print(f\"PGPID {row.pgpid} {row.doc_date_original} ({row.doc_date_calendar})\")\n",
+ " undate_calendar = None\n",
+ " if row.doc_date_calendar == \"Anno Mundi\":\n",
+ " undate_calendar = \"Hebrew\"\n",
+ " elif row.doc_date_calendar == \"Hijrī\":\n",
+ " undate_calendar = \"Islamic\"\n",
+ " elif row.doc_date_calendar == \"Seleucid\":\n",
+ " # handle seleucid as hebrew with offset (adapt from pgp code)\n",
+ " undate_calendar = \"Seleucid\"\n",
+ "\n",
+ " \n",
+ " if undate_calendar:\n",
+ " value = row.doc_date_original\n",
+ "\n",
+ " # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]\n",
+ " # ... the calendar parser don't support this, even though Undate does support unknown digits\n",
+ " # in future, perhaps we can add missing digit logic with this syntax to share across appropriate parsers\n",
+ " if '[.' in value:\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"ignoring missing digits for now {value}\")\n",
+ " value = value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\") \n",
+ " \n",
+ " # some dates have inferred numbers, e.g. Friday, [25] Nisan [4810] or 8 Elul (4)811'\n",
+ " # for now, just strip out brackets before parsing; \n",
+ " # in future, could potentially infer uncertainty based on these\n",
+ " value = value.replace('[', '').replace(']', '').replace('(', '').replace(')', '')\n",
+ "\n",
+ " # for now, remove modifiers that are not supported by undate parser:\n",
+ " # Late Tevet 4903, Last decade of Kislev 5004, first third of ...\n",
+ " # some dates include of, e.g. day of month\n",
+ " modifiers = [\"Late \", \"(first|middle|last)( third|half|decade|tenth)? (of )?\", \"(Beginning|end) of \", \"last day\", \"First 10 days\", \" of\", \"spring\", \"decade \", \"night, \"]\n",
+ " for mod in modifiers:\n",
+ " value = re.sub(mod, \"\", value, flags=re.I)\n",
+ "\n",
+ " # there are a handful of misspelled wednesdays...\n",
+ " value = value.replace(\"Wedensday\", \"Wednesday\")\n",
+ " # and a Thrusday\n",
+ " value = value.replace(\"Thrusday\", \"Thursday\")\n",
+ "\n",
+ " # three Hebrew calendar dates include text \"AM\" at end; at least one AH date\n",
+ " if value.endswith(\" AM\") or value.endswith(\" AH\"):\n",
+ " value = value[:-3]\n",
+ " if value.endswith(\".\"): # strip off trailing period\n",
+ " value = value[:-1]\n",
+ "\n",
+ " # about 62 have ordinals; strip them out\n",
+ " value = remove_ordinals(value)\n",
+ " \n",
+ " try:\n",
+ " return Undate.parse(value, undate_calendar)\n",
+ " except (VisitError, ValueError, UnexpectedEOF) as err:\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\")\n",
+ "\n",
+ " # there are a handful of cases in PGP where calendars are mixed,\n",
+ " # i.e. hebrew months used for hijri calendar\n",
+ "\n",
+ " # some dates are entered in ISO format for another calendar; can we parse and set calendar?\n",
+ " if \"-\" in value and \"/\" not in value: # exclude intervals for now\n",
+ " try:\n",
+ " parsed = Undate.parse(value, \"ISO8601\")\n",
+ " if parsed:\n",
+ " parsed = parsed.as_calendar(undate_calendar)\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\")\n",
+ " return parsed\n",
+ " except ValueError as err:\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"Could not parse {value} as ISO date: {err}\")\n",
+ "\n",
+ "docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b2bfb96-2d8b-4f09-a9a6-c2534273d503",
+ "metadata": {},
+ "source": [
+ "### Review parsing results \n",
+ "\n",
+ "How many of the dates in supported calendars were parsed?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "623eb160-ab6c-44ba-b3f4-6770c2c7bd86",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "original dates parsed: 3462\n",
+ "original dates unparsed: 173 (anno mundi, hijri, and seleucid calendars)\n",
+ "proportion parsed: 95.24%\n"
+ ]
+ }
+ ],
+ "source": [
+ "orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()\n",
+ "orig_dates_unparsed = docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid']) & docs_with_docdate.undate_orig.isna()] \n",
+ "\n",
+ "total_parsed = len(orig_dates_parsed)\n",
+ "total_unparsed = len(orig_dates_unparsed)\n",
+ "print(f\"\"\"original dates parsed: {total_parsed}\n",
+ "original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)\n",
+ "proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ae5b3cfa-ed25-4a3d-ae78-c7590543ba20",
+ "metadata": {},
+ "source": [
+ "What is the date granularity of the dates that were parsed?\n",
+ "\n",
+ "Note that these results are skewed somewhat due to the modifiers and uncertainty that we are simplifying in order to parse the dates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "42945787-6788-422d-9a04-f983ec6b31af",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1259 | \n",
+ " 1259 | \n",
+ " 1570 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1116-03-05 | \n",
+ " 1116-03-05 | \n",
+ " 1427-12-19 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 464 | \n",
+ " Tammuz 1288 | \n",
+ " Seleucid | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ " 1288-04 | \n",
+ " month | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1337 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1188-12-07 | \n",
+ " 1188-12-07 | \n",
+ " 1500-09-15 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5 449 1570 Seleucid 1259 \n",
+ "16 463 19 Adar 1427 Seleucid 1116-03-05 \n",
+ "17 464 Tammuz 1288 Seleucid 0977-06-21/0977-07-19 \n",
+ "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n",
+ "\n",
+ " undate_standard undate_orig orig_date_precision \n",
+ "5 1259 1570 year \n",
+ "16 1116-03-05 1427-12-19 day \n",
+ "17 0977-06-21/0977-07-19 1288-04 month \n",
+ "23 1025-08-28/1026-09-14 1337 year \n",
+ "41 1188-12-07 1500-09-15 day "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# determine original date precision based on parsed undate\n",
+ "orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())\n",
+ "orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision']].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "88f1d3ab-e1c7-48b5-8907-5aeea463f1e8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_date_precision\n",
+ "day 1599\n",
+ "month 1027\n",
+ "year 836\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# this is skewed because of the kinds of dates we're not able to parse or modifiers we're omitting entirely\n",
+ "orig_dates_parsed.orig_date_precision.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3fec8455-1830-48b5-961c-4ae0847bf63c",
+ "metadata": {},
+ "source": [
+ "Check on the Seleucid date parsing by comparing undate calendar conversion with the standardized CE date included in the dataset.\n",
+ "\n",
+ "We expect `undate` dates before 1583 to be off by several days (up to 10, depending on the century) since we did not adjust for the Julian calendar."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "5d3a55b0-ed36-47ba-b022-848bb128b449",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " doc_date_standard | \n",
+ " undate_earliest | \n",
+ " undate_latest | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1570 | \n",
+ " year | \n",
+ " 1259 | \n",
+ " 1258-09-07 | \n",
+ " 1259-09-26 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1427-12-19 | \n",
+ " day | \n",
+ " 1116-03-05 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 464 | \n",
+ " Tammuz 1288 | \n",
+ " Seleucid | \n",
+ " 1288-04 | \n",
+ " month | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ " 0977-06-26 | \n",
+ " 0977-07-24 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1337 | \n",
+ " year | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1025-09-03 | \n",
+ " 1026-09-20 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1500-09-15 | \n",
+ " day | \n",
+ " 1188-12-07 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " 502 | \n",
+ " Tevet 1548 | \n",
+ " Seleucid | \n",
+ " 1548-10 | \n",
+ " month | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ " 1236-12-07 | \n",
+ " 1237-01-04 | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " 506 | \n",
+ " Elul 1428 | \n",
+ " Seleucid | \n",
+ " 1428-06 | \n",
+ " month | \n",
+ " 1117-08-01/1117-08-29 | \n",
+ " 1117-08-08 | \n",
+ " 1117-09-05 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " 516 | \n",
+ " First decade of Ḥeshvan 1442 | \n",
+ " Seleucid | \n",
+ " 1442-08 | \n",
+ " month | \n",
+ " 1130-10-06/1130-10-15 | \n",
+ " 1130-10-13 | \n",
+ " 1130-11-10 | \n",
+ "
\n",
+ " \n",
+ " | 73 | \n",
+ " 537 | \n",
+ " Ḥeshvan 1453 | \n",
+ " Seleucid | \n",
+ " 1453-08 | \n",
+ " month | \n",
+ " 1141 | \n",
+ " 1141-10-11 | \n",
+ " 1141-11-08 | \n",
+ "
\n",
+ " \n",
+ " | 75 | \n",
+ " 544 | \n",
+ " Sunday, 21 Kislev 1355 | \n",
+ " Seleucid | \n",
+ " 1355-09-21 | \n",
+ " day | \n",
+ " 1043-11-26 | \n",
+ " 1043-12-02 | \n",
+ " 1043-12-02 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar undate_orig \\\n",
+ "5 449 1570 Seleucid 1570 \n",
+ "16 463 19 Adar 1427 Seleucid 1427-12-19 \n",
+ "17 464 Tammuz 1288 Seleucid 1288-04 \n",
+ "23 472 1337 Seleucid 1337 \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid 1500-09-15 \n",
+ "43 502 Tevet 1548 Seleucid 1548-10 \n",
+ "47 506 Elul 1428 Seleucid 1428-06 \n",
+ "55 516 First decade of Ḥeshvan 1442 Seleucid 1442-08 \n",
+ "73 537 Ḥeshvan 1453 Seleucid 1453-08 \n",
+ "75 544 Sunday, 21 Kislev 1355 Seleucid 1355-09-21 \n",
+ "\n",
+ " orig_date_precision doc_date_standard undate_earliest undate_latest \n",
+ "5 year 1259 1258-09-07 1259-09-26 \n",
+ "16 day 1116-03-05 1116-03-12 1116-03-12 \n",
+ "17 month 0977-06-21/0977-07-19 0977-06-26 0977-07-24 \n",
+ "23 year 1025-08-28/1026-09-14 1025-09-03 1026-09-20 \n",
+ "41 day 1188-12-07 1188-12-14 1188-12-14 \n",
+ "43 month 1236-11-30/1236-12-28 1236-12-07 1237-01-04 \n",
+ "47 month 1117-08-01/1117-08-29 1117-08-08 1117-09-05 \n",
+ "55 month 1130-10-06/1130-10-15 1130-10-13 1130-11-10 \n",
+ "73 month 1141 1141-10-11 1141-11-08 \n",
+ "75 day 1043-11-26 1043-12-02 1043-12-02 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seleucid_dates = orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'].copy()\n",
+ "# add undate earliest/latest (Gregorian) for comparison with dataset standardized date \n",
+ "seleucid_dates['undate_earliest'] = seleucid_dates.undate_orig.apply(lambda x: x.earliest)\n",
+ "seleucid_dates['undate_latest'] = seleucid_dates.undate_orig.apply(lambda x: x.latest)\n",
+ "\n",
+ "seleucid_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'undate_orig', 'orig_date_precision', 'doc_date_standard', 'undate_earliest', 'undate_latest']].head(10)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "a104d772-6c2c-4711-91ec-8cf1f108ae23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# can we sort by parsed original dates? \n",
+ "# doesn't work currently because of overlapping dates / different granularity\n",
+ "#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29f5f6eb-9b7d-4a4a-815a-29002d1d024b",
+ "metadata": {},
+ "source": [
+ "## Plot documents by date\n",
+ "\n",
+ "For the dates we could parse, how are the documents distributed over time and calendar?\n",
+ "\n",
+ "First let's graph by year based on the midpoint of the date range."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "c653d928-8fec-4ddc-9abf-ace2f7ca6629",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set earliest/latest for graphing\n",
+ "\n",
+ "# NOTE: we have to cast type to something pandas/altair supports\n",
+ "\n",
+ "orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')\n",
+ "orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')\n",
+ "orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "91f155fe-d0e6-4ee4-99de-698ac301e3f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orig_date_earliest | \n",
+ " orig_date_latest | \n",
+ " orig_date_mid | \n",
+ " pgpid | \n",
+ " doc_date_calendar | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 1258-09-07 | \n",
+ " 1259-09-26 | \n",
+ " 1259-03-18 | \n",
+ " 449 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ " 463 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 977-06-26 | \n",
+ " 977-07-24 | \n",
+ " 977-07-10 | \n",
+ " 464 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 1025-09-03 | \n",
+ " 1026-09-20 | \n",
+ " 1026-03-13 | \n",
+ " 472 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ " 499 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " 1236-12-07 | \n",
+ " 1237-01-04 | \n",
+ " 1236-12-21 | \n",
+ " 502 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " 1117-08-08 | \n",
+ " 1117-09-05 | \n",
+ " 1117-08-22 | \n",
+ " 506 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " 1130-10-13 | \n",
+ " 1130-11-10 | \n",
+ " 1130-10-27 | \n",
+ " 516 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 61 | \n",
+ " 1035-05-28 | \n",
+ " 1035-05-28 | \n",
+ " 1035-05-28 | \n",
+ " 524 | \n",
+ " Anno Mundi | \n",
+ "
\n",
+ " \n",
+ " | 62 | \n",
+ " 1034-08-25 | \n",
+ " 1034-09-22 | \n",
+ " 1034-09-08 | \n",
+ " 525 | \n",
+ " Hijrī | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orig_date_earliest orig_date_latest orig_date_mid pgpid doc_date_calendar\n",
+ "5 1258-09-07 1259-09-26 1259-03-18 449 Seleucid\n",
+ "16 1116-03-12 1116-03-12 1116-03-12 463 Seleucid\n",
+ "17 977-06-26 977-07-24 977-07-10 464 Seleucid\n",
+ "23 1025-09-03 1026-09-20 1026-03-13 472 Seleucid\n",
+ "41 1188-12-14 1188-12-14 1188-12-14 499 Seleucid\n",
+ "43 1236-12-07 1237-01-04 1236-12-21 502 Seleucid\n",
+ "47 1117-08-08 1117-09-05 1117-08-22 506 Seleucid\n",
+ "55 1130-10-13 1130-11-10 1130-10-27 516 Seleucid\n",
+ "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi\n",
+ "62 1034-08-25 1034-09-22 1034-09-08 525 Hijrī"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "144b2a4a-81cf-4a6d-a277-3a7910354a77",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# graph documents by calendar\n",
+ "import altair as alt\n",
+ "\n",
+ "date_docs_cal = orig_dates_parsed[orig_dates_parsed.doc_date_standard.notna()]\n",
+ "\n",
+ "dated_docs_cal = date_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n",
+ "dated_docs_cal['midpoint_year'] = dated_docs_cal.orig_date_mid.apply(lambda x: x.year)\n",
+ "\n",
+ "orig_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n",
+ " x=alt.X('midpoint_year', title=\"Year (midpoint)\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n",
+ " y=alt.Y('count(pgpid)', title='Documents'),\n",
+ " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\")\n",
+ ").properties(width=900, height=200, title=\"Documents by calendar (original date)\")\n",
+ "\n",
+ "orig_dates_calendars_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8e8cd7c-0711-40ae-84f6-d3f8df6d5ccc",
+ "metadata": {},
+ "source": [
+ "For comparison, what does it look like if we graph by the standardized dates in the dataset?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "4acc9a2b-d403-4f93-b2c5-6fee92ead105",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# graph documents with calendars\n",
+ "\n",
+ "def undate_midpoint(value):\n",
+ " # parsed standard date could be an undate or an interval; handle either\n",
+ " if isinstance(value, Undate):\n",
+ " earliest = value.earliest\n",
+ " latest = value.latest\n",
+ " else: # interval\n",
+ " earliest = value.earliest.earliest\n",
+ " latest = value.latest.latest\n",
+ " return earliest + (latest - earliest)/2\n",
+ " \n",
+ "\n",
+ "dated_docs_cal = docs_with_docdate.copy()\n",
+ "dated_docs_cal = dated_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n",
+ "# get the midpoint from the parsed standard date; convert to supported type\n",
+ "dated_docs_cal['midpoint'] = dated_docs_cal.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n",
+ "dated_docs_cal['midpoint_year'] = dated_docs_cal.midpoint.apply(lambda x: x.year if pd.notna(x) else None)\n",
+ "\n",
+ "\n",
+ "std_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n",
+ " x=alt.X('midpoint_year', title=\"Year\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n",
+ " y=alt.Y('count(pgpid)', title='Documents'),\n",
+ " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\").scale(domain=['Anno Mundi', 'Hijrī', 'Seleucid', 'Kharājī', 'Unspecified'])\n",
+ ").properties(width=900, height=200, title=\"Documents by calendar (standard date)\")\n",
+ "\n",
+ "std_dates_calendars_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f42471a4-0c64-4237-92c0-0d201377fa9f",
+ "metadata": {},
+ "source": [
+ "Here are the two plots together. The unspecified calendars are most likely Julian/Gregorian dates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "4d7c4d5f-636c-42a0-a906-21c67f5781b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_calendars_chart & std_dates_calendars_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc8a4617-ca69-4494-a2ef-6f4d442b82e6",
+ "metadata": {},
+ "source": [
+ "We can try graphing by range, but our parsing currently excludes the original dates with larger ranges."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "c5861110-dbd5-4d7a-8ada-acf7cb871aa7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].copy()\n",
+ "# graphable_data['midpoint'] = graphable_data.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n",
+ "graphable_data['midpoint_year'] = graphable_data.orig_date_mid.apply(lambda x: x.year if pd.notna(x) else None)\n",
+ "\n",
+ "\n",
+ "bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(\n",
+ " x=alt.X('orig_date_earliest:T', title=\"original date (range)\"), # , axis=alt.Axis(format=\"r\")),\n",
+ " x2='orig_date_latest:T',\n",
+ " y=alt.Y('count(pgpid)', title='Count of Documents')\n",
+ ").properties(width=1200, height=150)\n",
+ "\n",
+ "line_chart = alt.Chart(graphable_data).mark_line(opacity=0.6, color=\"green\", interpolate=\"monotone\").encode(\n",
+ " x=alt.X('orig_date_mid:T', title=\"Year (midpoint)\"),\n",
+ " y=alt.Y('count(pgpid)', title='Documents')\n",
+ ").properties(width=1200, height=150)\n",
+ "\n",
+ "(bar_chart & line_chart).properties(title=\"Documents by date (1000-1300)\").interactive()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "951d92ea-4689-481c-8590-324b782a7a1c",
+ "metadata": {},
+ "source": [
+ "## Compare weekdays\n",
+ "\n",
+ "Sometimes the original date includes a day of the week; we don't expect these to be completely reliable, but let's compare the weekdays in the original date with the weekday as determined by the parsed `Undate`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "3122a874-bb17-429f-993f-4bf7a76c1a36",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 1714 | \n",
+ " 2418 | \n",
+ " Monday 20 Tevet 1520 | \n",
+ " Seleucid | \n",
+ " 1208-12-29 | \n",
+ " 1208-12-29 | \n",
+ " 1520-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 29924 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 34008 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 34466 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 34467 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "29924 1214/1215 1214/1215 1526-10-13 day \n",
+ "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ " type \n",
+ "851 Legal document \n",
+ "1714 Legal document \n",
+ "1929 Legal document \n",
+ "2013 Legal document \n",
+ "3257 Legal document \n",
+ "... ... \n",
+ "29303 Legal document \n",
+ "29924 Legal document \n",
+ "34008 Legal document \n",
+ "34466 Legal document \n",
+ "34467 Legal document \n",
+ "\n",
+ "[104 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision', 'type']]\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9c03fd7-731c-44ce-ae2d-0bc1308790d0",
+ "metadata": {},
+ "source": [
+ "Extract the weekday from the original date and determine the undate weekday.\n",
+ "\n",
+ "In both the Hebrew and Islamic (Hijrī) calendars the day begins in the evening, so if the date string includes the text \"night\" we shift the original weekday forward by one for comparison."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "3e4ea50c-b11c-433b-b6f9-691098b057d3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 1714 | \n",
+ " 2418 | \n",
+ " Monday 20 Tevet 1520 | \n",
+ " Seleucid | \n",
+ " 1208-12-29 | \n",
+ " 1208-12-29 | \n",
+ " 1520-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 29924 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 34008 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 34466 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 34467 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "29924 1214/1215 1214/1215 1526-10-13 day \n",
+ "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ " type undate_weekday undate_weekday_name orig_weekday \n",
+ "851 Legal document 3 Thursday Thursday \n",
+ "1714 Legal document 0 Monday Monday \n",
+ "1929 Legal document 0 Monday Monday \n",
+ "2013 Legal document 2 Wednesday Wednesday \n",
+ "3257 Legal document 3 Thursday Thursday \n",
+ "... ... ... ... ... \n",
+ "29303 Legal document 0 Monday Monday \n",
+ "29924 Legal document 2 Wednesday Wednesday \n",
+ "34008 Legal document 0 Monday Monday \n",
+ "34466 Legal document 0 Monday Monday \n",
+ "34467 Legal document 4 Friday Friday \n",
+ "\n",
+ "[104 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "days = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n",
+ "\n",
+ "# get numeric weekday; since these dates are all day-precision we can just use the earliest date\n",
+ "weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)\n",
+ "weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])\n",
+ "# extract weekday from date label\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()\n",
+ "# correct misspellings\n",
+ "misspelled_days = {\n",
+ " \"Wedensday\": \"Wednesday\",\n",
+ " \"Thrusday\": \"Thursday\",\n",
+ "}\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))\n",
+ "\n",
+ "# shift night to next day, e.g. Wednesday night should be Thursday\n",
+ "# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day\n",
+ "def next_day(weekday):\n",
+ " return days[(days.index(weekday) +1) % 7]\n",
+ "\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if \" night\" in row.doc_date_original else row.orig_weekday, axis=1)\n",
+ "\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3ab3428-9700-4e57-b3ff-329c737d98f7",
+ "metadata": {},
+ "source": [
+ "Here is the subset of records that specify \"night\":"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "4ced7809-1414-44ae-aae7-c2d0d1dee9ad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 5511 | \n",
+ " 7237 | \n",
+ " Tuesday night, 22 Kislev 1435 | \n",
+ " Seleucid | \n",
+ " 1123-12-12 | \n",
+ " 1123-12-12 | \n",
+ " 1435-09-22 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 5857 | \n",
+ " 7642 | \n",
+ " Thursday night, 23 Tammuz 1538 | \n",
+ " Seleucid | \n",
+ " 1227-07-09 | \n",
+ " 1227-07-09 | \n",
+ " 1538-04-23 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ " | 6419 | \n",
+ " 8332 | \n",
+ " Friday night, 20 Iyar 4957 | \n",
+ " Anno Mundi | \n",
+ " 1197-05 | \n",
+ " 1197-05 | \n",
+ " 4957-02-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Saturday | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "5511 7237 Tuesday night, 22 Kislev 1435 Seleucid \n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid \n",
+ "5857 7642 Thursday night, 23 Tammuz 1538 Seleucid \n",
+ "6419 8332 Friday night, 20 Iyar 4957 Anno Mundi \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "5511 1123-12-12 1123-12-12 1435-09-22 day \n",
+ "5854 1127 1127 1438-10-29 day \n",
+ "5857 1227-07-09 1227-07-09 1538-04-23 day \n",
+ "6419 1197-05 1197-05 4957-02-20 day \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "\n",
+ " type undate_weekday undate_weekday_name orig_weekday \n",
+ "851 Legal document 3 Thursday Thursday \n",
+ "1929 Legal document 0 Monday Monday \n",
+ "3257 Legal document 3 Thursday Thursday \n",
+ "5511 Legal document 2 Wednesday Wednesday \n",
+ "5854 Legal document 4 Friday Tuesday \n",
+ "5857 Legal document 4 Friday Friday \n",
+ "6419 Legal document 5 Saturday Saturday \n",
+ "29303 Legal document 0 Monday Monday "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates[weekday_dates.doc_date_original.str.contains(\" night\")]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "94b8aae8-6bc1-425c-b723-427356cfb647",
+ "metadata": {},
+ "source": [
+ "How many of the original and undate weekdays match?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "fedb5323-0e9c-476e-a7e2-95443d2f9e1d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "44 matches, 60 mismatches (42.31%)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5271 | \n",
+ " 6947 | \n",
+ " Monday 3 Iyyar 1740 | \n",
+ " Seleucid | \n",
+ " 1429-04-07 | \n",
+ " 1429-04-07 | \n",
+ " 1740-02-03 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 8648 | \n",
+ " 11227 | \n",
+ " Monday 24 Jumādā I 517 | \n",
+ " Hijrī | \n",
+ " 1123-07-20 | \n",
+ " 1123-07-20 | \n",
+ " 0517-05-24 | \n",
+ " day | \n",
+ " Paraliterary text | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 16397 | \n",
+ " 19649 | \n",
+ " Thursday 26 Iyyar 5306 | \n",
+ " Anno Mundi | \n",
+ " 1546-04-28 | \n",
+ " 1546-04-28 | \n",
+ " 5306-02-26 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 17723 | \n",
+ " 21094 | \n",
+ " Saturday 20 Rajab 550 | \n",
+ " Hijrī | \n",
+ " 1155-09-19 | \n",
+ " 1155-09-19 | \n",
+ " 0550-07-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Saturday | \n",
+ "
\n",
+ " \n",
+ " | 23099 | \n",
+ " 27479 | \n",
+ " Tuesday 11 Tammuz 5525 | \n",
+ " Anno Mundi | \n",
+ " 1765-06-30 | \n",
+ " 1765-06-30 | \n",
+ " 5525-04-11 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 6 | \n",
+ " Sunday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 23104 | \n",
+ " 27484 | \n",
+ " Friday 20th Shevat 5405 | \n",
+ " Anno Mundi | \n",
+ " 1645 | \n",
+ " 1645 | \n",
+ " 5405-11-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ " | 23105 | \n",
+ " 27485 | \n",
+ " Sunday 22 Adar 5590 | \n",
+ " Anno Mundi | \n",
+ " 1830-03-17 | \n",
+ " 1830-03-17 | \n",
+ " 5590-12-22 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23107 | \n",
+ " 27487 | \n",
+ " Thursday 15 Shevat 5450 | \n",
+ " Anno Mundi | \n",
+ " 1690-01-25 | \n",
+ " 1690-01-25 | \n",
+ " 5450-11-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 23109 | \n",
+ " 27489 | \n",
+ " Sunday 6 Nisan 5528 | \n",
+ " Anno Mundi | \n",
+ " 1768-03-24 | \n",
+ " 1768-03-24 | \n",
+ " 5528-01-06 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23110 | \n",
+ " 27490 | \n",
+ " Thursday 19th Elul 5428 | \n",
+ " Anno Mundi | \n",
+ " 1668 | \n",
+ " 1668 | \n",
+ " 5428-06-19 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 6 | \n",
+ " Sunday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 23111 | \n",
+ " 27491 | \n",
+ " Tuesday 1 Kislev 5507 | \n",
+ " Anno Mundi | \n",
+ " 1746-11-14 | \n",
+ " 1746-11-14 | \n",
+ " 5507-09-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 23116 | \n",
+ " 27496 | \n",
+ " Sunday 28 Elul 5511 | \n",
+ " Anno Mundi | \n",
+ " 1751-09-18 | \n",
+ " 1751-09-18 | \n",
+ " 5511-06-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23117 | \n",
+ " 27497 | \n",
+ " Sunday 17th Sivan 5423 | \n",
+ " Anno Mundi | \n",
+ " 1663 | \n",
+ " 1663 | \n",
+ " 5423-03-17 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23118 | \n",
+ " 27498 | \n",
+ " Sunday 25th Tevet 5409 | \n",
+ " Anno Mundi | \n",
+ " 1648 | \n",
+ " 1648 | \n",
+ " 5409-10-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23120 | \n",
+ " 27500 | \n",
+ " Thursday 4 Sivan 5516 | \n",
+ " Anno Mundi | \n",
+ " 1756-06-02 | \n",
+ " 1756-06-02 | \n",
+ " 5516-03-04 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 23127 | \n",
+ " 27507 | \n",
+ " Sunday 25 Sivan 5556 | \n",
+ " Anno Mundi | \n",
+ " 1796-07-01 | \n",
+ " 1796-07-01 | \n",
+ " 5556-03-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23131 | \n",
+ " 27511 | \n",
+ " Wednesday 28th Tevet 5399 | \n",
+ " Anno Mundi | \n",
+ " 1640 | \n",
+ " 1640 | \n",
+ " 5399-10-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 1 | \n",
+ " Tuesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 23135 | \n",
+ " 27515 | \n",
+ " Monday 15th Iyyar 5414 | \n",
+ " Anno Mundi | \n",
+ " 1654 | \n",
+ " 1654 | \n",
+ " 5414-02-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 23136 | \n",
+ " 27516 | \n",
+ " Thursday 24 Nisan 5481 | \n",
+ " Anno Mundi | \n",
+ " 1721-04-21 | \n",
+ " 1721-04-21 | \n",
+ " 5481-01-24 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5271 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
+ "8648 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n",
+ "16397 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n",
+ "17723 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n",
+ "23099 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n",
+ "23104 27484 Friday 20th Shevat 5405 Anno Mundi 1645 \n",
+ "23105 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n",
+ "23107 27487 Thursday 15 Shevat 5450 Anno Mundi 1690-01-25 \n",
+ "23109 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n",
+ "23110 27490 Thursday 19th Elul 5428 Anno Mundi 1668 \n",
+ "23111 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n",
+ "23116 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n",
+ "23117 27497 Sunday 17th Sivan 5423 Anno Mundi 1663 \n",
+ "23118 27498 Sunday 25th Tevet 5409 Anno Mundi 1648 \n",
+ "23120 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n",
+ "23127 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n",
+ "23131 27511 Wednesday 28th Tevet 5399 Anno Mundi 1640 \n",
+ "23135 27515 Monday 15th Iyyar 5414 Anno Mundi 1654 \n",
+ "23136 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n",
+ "\n",
+ " undate_standard undate_orig orig_date_precision type \\\n",
+ "5271 1429-04-07 1740-02-03 day Legal document \n",
+ "5854 1127 1438-10-29 day Legal document \n",
+ "8648 1123-07-20 0517-05-24 day Paraliterary text \n",
+ "16397 1546-04-28 5306-02-26 day Legal document \n",
+ "17723 1155-09-19 0550-07-20 day Legal document \n",
+ "23099 1765-06-30 5525-04-11 day Legal document \n",
+ "23104 1645 5405-11-20 day Legal document \n",
+ "23105 1830-03-17 5590-12-22 day Legal document \n",
+ "23107 1690-01-25 5450-11-15 day Legal document \n",
+ "23109 1768-03-24 5528-01-06 day Legal document \n",
+ "23110 1668 5428-06-19 day Legal document \n",
+ "23111 1746-11-14 5507-09-01 day Legal document \n",
+ "23116 1751-09-18 5511-06-28 day Legal document \n",
+ "23117 1663 5423-03-17 day Legal document \n",
+ "23118 1648 5409-10-25 day Legal document \n",
+ "23120 1756-06-02 5516-03-04 day Legal document \n",
+ "23127 1796-07-01 5556-03-25 day Legal document \n",
+ "23131 1640 5399-10-28 day Legal document \n",
+ "23135 1654 5414-02-15 day Legal document \n",
+ "23136 1721-04-21 5481-01-24 day Legal document \n",
+ "\n",
+ " undate_weekday undate_weekday_name orig_weekday \n",
+ "5271 3 Thursday Monday \n",
+ "5854 4 Friday Tuesday \n",
+ "8648 4 Friday Monday \n",
+ "16397 2 Wednesday Thursday \n",
+ "17723 0 Monday Saturday \n",
+ "23099 6 Sunday Tuesday \n",
+ "23104 3 Thursday Friday \n",
+ "23105 2 Wednesday Sunday \n",
+ "23107 2 Wednesday Thursday \n",
+ "23109 3 Thursday Sunday \n",
+ "23110 6 Sunday Thursday \n",
+ "23111 0 Monday Tuesday \n",
+ "23116 5 Saturday Sunday \n",
+ "23117 4 Friday Sunday \n",
+ "23118 5 Saturday Sunday \n",
+ "23120 2 Wednesday Thursday \n",
+ "23127 4 Friday Sunday \n",
+ "23131 1 Tuesday Wednesday \n",
+ "23135 5 Saturday Monday \n",
+ "23136 0 Monday Thursday "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]\n",
+ "\n",
+ "mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]\n",
+ "\n",
+ "print(f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)\")\n",
+ "mismatches.head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "492352af-76db-47b5-afa2-f5388c4d1d71",
+ "metadata": {},
+ "source": [
+ "Is there any noticeable difference about where the mismatches are coming from based on calendar or day of week?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "d6476907-1628-4d68-ab1f-43c95e123707",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Anno Mundi 55\n",
+ "Seleucid 3\n",
+ "Hijrī 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mismatches.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "18b71d18-5d5b-4f92-8801-499bcf412efe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_weekday\n",
+ "Wednesday 17\n",
+ "Sunday 12\n",
+ "Monday 10\n",
+ "Thursday 9\n",
+ "Tuesday 7\n",
+ "Friday 4\n",
+ "Saturday 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mismatches.orig_weekday.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "eb7ea065-e4b5-47aa-9538-8dc9851ea572",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 mismatches that include text 'night'\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
+ "\n",
+ " undate_standard undate_orig orig_date_precision type \\\n",
+ "5854 1127 1438-10-29 day Legal document \n",
+ "\n",
+ " undate_weekday undate_weekday_name orig_weekday \n",
+ "5854 4 Friday Tuesday "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many mismatches are due to night?\n",
+ "night_mismatches = mismatches[mismatches.doc_date_original.str.contains(\" night\")]\n",
+ "print(f\"{len(night_mismatches)} mismatches that include text 'night'\")\n",
+ "night_mismatches"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16f9a9db-434f-407e-8613-42941b4f3a14",
+ "metadata": {},
+ "source": [
+ "### Plot document frequency by day\n",
+ "\n",
+ "Because we're preserving as much date information as possible, we can plot based on things like weekday - even across different calendars.\n",
+ "\n",
+ "For documents with day-level date precision, how are they distributed by weekday?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "ece780b8-2eb2-4cbc-9195-27def665f7fa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get numeric weekday\n",
+ "orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)\n",
+ "orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
+ "\n",
+ "# restrict to dates with day precision; the rest are just using earliest day\n",
+ "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']\n",
+ "\n",
+ "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Color('count(pgpid)', title='# of documents')\n",
+ ").properties(title='document frequency by weekday')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "6b2f24de-18ce-4f40-b300-e8cc334a338c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "undate_weekday_name\n",
+ "Monday 305\n",
+ "Thursday 282\n",
+ "Tuesday 241\n",
+ "Sunday 229\n",
+ "Wednesday 229\n",
+ "Friday 215\n",
+ "Saturday 98\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_days.undate_weekday_name.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "dea83b43-b379-4807-8a33-8e26d7f4f8e7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_calendar_chart = alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " # alt.Y('doc_date_calendar'),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").facet(row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")).properties(title='document frequency by weekday and calendar')\n",
+ "weekday_calendar_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "484069be-8f75-4197-8f96-4683ab509028",
+ "metadata": {},
+ "source": [
+ "This chart is skewed due to the fact we have so many more day-precision dates from the Hebrew calendar than any other. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "cfecdb64-03b4-405b-b1f3-85e876f55680",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Anno Mundi 82\n",
+ "Seleucid 20\n",
+ "Hijrī 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfdfcf6b-d572-4f9b-8538-eca932f50942",
+ "metadata": {},
+ "source": [
+ "This is more obvious if we use independent color scales."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "e66917b0-2221-42dd-a99b-df847b8e815b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_calendar_chart.resolve_scale(color='independent')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e2a74a1-546b-4069-bff5-29788dee8997",
+ "metadata": {},
+ "source": [
+ "What about weekday by century? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "6a7a0bf5-f8c2-4034-8495-2fb4b297740a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ " century | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 1714 | \n",
+ " 2418 | \n",
+ " Monday 20 Tevet 1520 | \n",
+ " Seleucid | \n",
+ " 1208-12-29 | \n",
+ " 1208-12-29 | \n",
+ " 1520-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ " | 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ " 1000s | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 29924 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 34008 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1000s | \n",
+ "
\n",
+ " \n",
+ " | 34466 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ " | 34467 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 12 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "29924 1214/1215 1214/1215 1526-10-13 day \n",
+ "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ " type undate_weekday undate_weekday_name orig_weekday century \n",
+ "851 Legal document 3 Thursday Thursday 1200s \n",
+ "1714 Legal document 0 Monday Monday 1200s \n",
+ "1929 Legal document 0 Monday Monday 1100s \n",
+ "2013 Legal document 2 Wednesday Wednesday 1000s \n",
+ "3257 Legal document 3 Thursday Thursday 1200s \n",
+ "... ... ... ... ... ... \n",
+ "29303 Legal document 0 Monday Monday 1200s \n",
+ "29924 Legal document 2 Wednesday Wednesday 1200s \n",
+ "34008 Legal document 0 Monday Monday 1000s \n",
+ "34466 Legal document 0 Monday Monday 1100s \n",
+ "34467 Legal document 4 Friday Friday 1100s \n",
+ "\n",
+ "[104 rows x 12 columns]"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get rough century (gregorian calendar)\n",
+ "weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: (\"%04d\" % x.earliest.year)[:2] + \"00s\")\n",
+ "\n",
+ "weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'century']].head()\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "eb99871e-d9a5-4211-9bd2-5a9acfe8face",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Y('century'),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").properties(title='document frequency by weekday and century')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfd1e93b-1286-43d9-be44-34ba607435e1",
+ "metadata": {},
+ "source": [
+ "The weekday + century heatmap suggests we're more likely to have day-level precision dates from the 1700s than any other time period in the dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ec7d437-092f-47de-b60c-a1b72f45b4dd",
+ "metadata": {},
+ "source": [
+ "## Plot frequency by month and calendar"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "08a58fcf-2b08-441b-9dc8-385bafeb88e6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what about heat map by month?\n",
+ "\n",
+ "# get numeric month\n",
+ "orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)\n",
+ "# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
+ "\n",
+ "has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]\n",
+ "\n",
+ "alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_month', title='month'),\n",
+ " alt.Color('count(pgpid)', title='# of documents')\n",
+ ").facet(\n",
+ " row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ ").properties(title='Document frequency by month and calendar')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ad489d5-483d-4280-a7d8-0090fdd2aa32",
+ "metadata": {},
+ "source": [
+ "That very light month 13 in the Hebrew and Seleucid calendars reflects the fact that the Hebrew calendar has a leap _month_."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "a7a16c53-6f01-4457-9458-4fcf80a35c51",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Seleucid 1196\n",
+ "Anno Mundi 903\n",
+ "Hijrī 516\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "has_month.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "65bce74e-67b7-48df-9f7f-a6f264af4f11",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1593, 38)"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_days[orig_dates_days.undate_weekday_name.notna()].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "ac940883-e00e-4dde-8339-95a1b733f6f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_69693/2787254306.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# weekday frequency by month?\n",
+ "\n",
+ "orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n",
+ "\n",
+ "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Y('undate_month', title=\"month\"),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").facet(\n",
+ " column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ ").properties(title='Document frequency by weekday and month (1,557 documents)')\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/pyproject.toml
index 2dc6515..ef2fe99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ classifiers = [
[project.optional-dependencies]
docs = ["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"]
test = ["pytest>=7.2", "pytest-ordering", "pytest-cov"]
-notebooks = ["jupyterlab", "pandas", "treon"]
+notebooks = ["jupyterlab", "pandas", "treon", "altair"]
check = ["undate[docs]", "undate[notebooks]", "mypy", "ruff"]
dev = [
"pre-commit>=2.20.0",
diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py
index a43a270..5836b2f 100644
--- a/src/undate/converters/calendars/__init__.py
+++ b/src/undate/converters/calendars/__init__.py
@@ -1,5 +1,11 @@
from undate.converters.calendars.gregorian import GregorianDateConverter
from undate.converters.calendars.hebrew import HebrewDateConverter
from undate.converters.calendars.islamic import IslamicDateConverter
+from undate.converters.calendars.seleucid import SeleucidDateConverter
-__all__ = ["GregorianDateConverter", "HebrewDateConverter", "IslamicDateConverter"]
+__all__ = [
+ "GregorianDateConverter",
+ "HebrewDateConverter",
+ "IslamicDateConverter",
+ "SeleucidDateConverter",
+]
diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark
index b55ec3f..6f4244c 100644
--- a/src/undate/converters/calendars/hebrew/hebrew.lark
+++ b/src/undate/converters/calendars/hebrew/hebrew.lark
@@ -3,7 +3,7 @@
// only support day month year format for now
// parser requires numeric day and year to be distinguished based on order
-hebrew_date: day month year | month year | year
+hebrew_date: weekday? day month comma? year | month year | year
// TODO: handle date ranges?
@@ -27,10 +27,14 @@ month: month_1
| month_10
| month_11
| month_12
- | month_13
+ | month_13
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/
+comma: ","
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma?
+
+
// months, in order; from convertdate list
// with variants from Princeton Geniza Project
// support matching with and without accents
@@ -43,11 +47,13 @@ month_5: "Av"
month_6: "Elul"
// Tishrei or Tishri
month_7: /Tishre?i/
-month_8: "Heshvan"
+// Heshvan, Ḥeshvan, Marḥeshvan
+month_8: /(Mar)?[ḤHḥ]eshvan/
month_9: "Kislev"
// Tevet or Teveth
month_10: /[ṬT]eveth?/
-month_11: "Shevat"
+// Shevat or Shevaṭ
+month_11: /Sheva[tṭ]/
// Adar I or Adar
month_12: /Adar( I)?/
// Adar II or Adar Bet
diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py
index 48e8b20..8880434 100644
--- a/src/undate/converters/calendars/hebrew/transformer.py
+++ b/src/undate/converters/calendars/hebrew/transformer.py
@@ -13,6 +13,8 @@ class HebrewDateTransformer(Transformer):
"""Transform a Hebrew date parse tree and return an Undate or
UndateInterval."""
+ calendar = Calendar.HEBREW
+
def hebrew_date(self, items):
parts = {}
for child in items:
@@ -22,9 +24,9 @@ def hebrew_date(self, items):
value = int(child.children[0])
parts[str(child.data)] = value
- # initialize and return an undate with islamic year, month, day and
- # islamic calendar
- return HebrewUndate(**parts)
+ # initialize and return an undate with year, month, day and
+ # configured calendar (hebrew by default)
+ return Undate(**parts, calendar=self.calendar)
# year translation is not needed since we want a tree with name year
# this is equivalent to a no-op
diff --git a/src/undate/converters/calendars/islamic/islamic.lark b/src/undate/converters/calendars/islamic/islamic.lark
index 3ad59a5..1e4940b 100644
--- a/src/undate/converters/calendars/islamic/islamic.lark
+++ b/src/undate/converters/calendars/islamic/islamic.lark
@@ -3,7 +3,7 @@
// only support day month year format for now
// parser requires numeric day and year to be distinguished based on order
-islamic_date: day month year | month year | year
+islamic_date: weekday? day month year | month year | year
// TODO: handle date ranges?
@@ -13,6 +13,7 @@ islamic_date: day month year | month year | year
year: /\d+/
+
// months
month: month_1
| month_2
@@ -29,6 +30,10 @@ month: month_1
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/
+
+comma: ","
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma?
+
// months, in order; from convertdate list
// with variants from Princeton Geniza Project
// support matching with and without accents
@@ -42,7 +47,7 @@ month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|II)/
// Jumādā al-ʾAwwal or Jumādā I
month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/
// Jumādā ath-Thāniya or Jumādā II
-month_6: /Jum[āa]d[āa] (ath-Th[āa]niyah|II)/
+month_6: /Jum[āa][dḍ][āa] (ath-Th[āa]niyah|II)/
month_7: "Rajab"
// Shaʿbān
month_8: /Sha[ʿ']b[āa]n/
diff --git a/src/undate/converters/calendars/seleucid.py b/src/undate/converters/calendars/seleucid.py
new file mode 100644
index 0000000..bddf867
--- /dev/null
+++ b/src/undate/converters/calendars/seleucid.py
@@ -0,0 +1,24 @@
+from undate.converters.calendars import HebrewDateConverter
+from undate.undate import Calendar
+
+
+class SeleucidDateConverter(HebrewDateConverter):
+ #: offset for Seleucid calendar: Seleucid year + 3449 = Anno Mundi year
+ SELEUCID_OFFSET = 3449
+
+ #: converter name: Seleucid
+ name: str = "Seleucid"
+ calendar_name: str = "Seleucid"
+
+ def __init__(self):
+ super().__init__()
+ # override hebrew calendar to initialize undates with seleucid
+ # calendar; this triggers Seleucid calendar to_gregorian method use
+ self.transformer.calendar = Calendar.SELEUCID
+
+ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]:
+ """Convert a Seleucid date, specified by year, month, and day,
+ to the Gregorian equivalent date. Uses hebrew calendar conversion
+ logic with :attr:`SELEUCID_OFFSET`. Returns a tuple of year, month, day.
+ """
+ return super().to_gregorian(year + self.SELEUCID_OFFSET, month, day)
diff --git a/src/undate/date.py b/src/undate/date.py
index 27f6efa..c953995 100644
--- a/src/undate/date.py
+++ b/src/undate/date.py
@@ -104,6 +104,27 @@ def day(self) -> Optional[int]:
return int(str(self.astype("datetime64[D]")).split("-")[-1])
return None
+ @property
+ def weekday(self) -> Optional[int]:
+ """Equivalent to :meth:`datetime.date.weekday`; returns day of week as an
+ integer where Monday is 0 and Sunday is 6. Only supported for dates
+ with date unit in days.
+ """
+ # only return a weekday if date unit is in days
+ if self.dtype == "datetime64[D]":
+ # calculate based on difference between current day and week start
+ # numpy datetime weeks start on thursdays - presumably since
+ # unix epoch day zero was a thursday...
+
+ # implementation inspired in part by https://stackoverflow.com/a/54264187
+
+ thursday_week = self.astype("datetime64[W]")
+ days_from_thursday = (self - thursday_week).astype(int)
+ # if monday is 0, thursday is 3
+ return (days_from_thursday + 3) % 7
+
+ return None
+
def __sub__(self, other):
# modify to conditionally return a timedelta object instead of a
# Date object with dtype timedelta64[D] (default behavior)
diff --git a/src/undate/undate.py b/src/undate/undate.py
index be4454a..dc4d506 100644
--- a/src/undate/undate.py
+++ b/src/undate/undate.py
@@ -29,6 +29,7 @@ class Calendar(StrEnum):
GREGORIAN = auto()
HEBREW = auto()
ISLAMIC = auto()
+ SELEUCID = auto()
@staticmethod
def get_converter(calendar):
@@ -96,7 +97,6 @@ def __init__(
if calendar is not None:
self.set_calendar(calendar)
self.calendar_converter = Calendar.get_converter(self.calendar)
-
self.calculate_earliest_latest(year, month, day)
if converter is None:
@@ -192,9 +192,12 @@ def calculate_earliest_latest(self, year, month, day):
)
def set_calendar(self, calendar: Union[str, Calendar]):
+ """Find calendar by name if passed as string and set on the object.
+ Only intended for use at initialization time; use :meth:`as_calendar`
+ to change calendar."""
if calendar is not None:
# if not passed as a Calendar instance, do a lookup
- if not isinstance(calendar, Calendar):
+ if isinstance(calendar, str):
# look for calendar by upper-case name
try:
calendar = Calendar[calendar.upper()]
@@ -202,6 +205,19 @@ def set_calendar(self, calendar: Union[str, Calendar]):
raise ValueError(f"Calendar `{calendar}` is not supported") from err
self.calendar = calendar
+ def as_calendar(self, calendar: Union[str, Calendar]):
+ """Return a new :class:`Undate` object with the same year, month, day, and labels
+ used to initialize the current object, but with a different calendar. Note that this
+ does NOT do calendar conversion, but reinterprets current numeric year, month, day values
+ according to the new calendar."""
+ return Undate(
+ year=self.initial_values.get("year"),
+ month=self.initial_values.get("month"),
+ day=self.initial_values.get("day"),
+ label=self.label,
+ calendar=calendar,
+ )
+
def __str__(self) -> str:
# if any portion of the date is partially known, construct
# pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits
@@ -319,8 +335,12 @@ def __lt__(self, other: object) -> bool:
# (e.g., single date within the same year)
# comparison for those cases is not currently supported
elif other in self or self in other:
+ # sort by precision, most precise first
+ by_precision = sorted(
+ [self, other], key=lambda x: x.precision, reverse=True
+ )
raise NotImplementedError(
- "Can't compare when one date falls within the other"
+ f"Can't compare when one date ({by_precision[0]}) falls within the other ({by_precision[1]})"
)
# NOTE: unsupported comparisons are supposed to return NotImplemented
# However, doing that in this case results in a confusing TypeError!
@@ -405,7 +425,7 @@ def year(self) -> Optional[str]:
year = self._get_date_part("year")
if year:
return f"{year:0>4}"
- # if value is unset but date precision is month or greater, return unknown month
+ # if value is unset but date precision is year or greater, return unknown year
elif self.precision >= DatePrecision.YEAR:
return self.MISSING_DIGIT * 4
return None
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
index 6e4a5e6..7dcca83 100644
--- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
@@ -26,6 +26,12 @@ def test_hebrew_undate():
("5362", HebrewUndate(5362), DatePrecision.YEAR),
# add when we support parsing ranges:
# Adar I and Adar II 5453 : (1693 CE)
+ # support weekdays included in text
+ ("Thursday, 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
+ # with or without comma
+ ("Thursday 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
+ # huh, current parsing completely ignores whitespace; do we want that?
+ ("Thursday12Sivan4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
]
diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
index 951a9f8..04ff53b 100644
--- a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
+++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
@@ -28,6 +28,7 @@ def test_islamic_undate():
# examples from ISMI data (reformatted to day month year)
# Rabi 1 = month 3
("14 Rabīʿ I 901", IslamicUndate(901, 3, 14), DatePrecision.DAY),
+ ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH),
("884", IslamicUndate(884), DatePrecision.YEAR),
# Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)),
# add when we support parsing ranges:
diff --git a/tests/test_converters/test_calendars/test_seleucid.py b/tests/test_converters/test_calendars/test_seleucid.py
new file mode 100644
index 0000000..fd8bc82
--- /dev/null
+++ b/tests/test_converters/test_calendars/test_seleucid.py
@@ -0,0 +1,109 @@
+from undate.converters.calendars import SeleucidDateConverter
+from undate.date import Date, DatePrecision
+from undate.undate import Calendar, Undate
+
+
+class TestSeleucidDateConverter:
+ def test_parse(self):
+ # day
+ # Elul = month 6; 7 September, 1000 Gregorian
+ date_str = "29 Elul 1311"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date == Undate(1311, 6, 29, calendar="Seleucid")
+ assert date.calendar == Calendar.SELEUCID
+ assert date.precision == DatePrecision.DAY
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ date_str = "Tishri 1458" # month 7
+ date = SeleucidDateConverter().parse(date_str)
+ assert date == Undate(1458, 7, calendar="Seleucid")
+ assert date.calendar == Calendar.SELEUCID
+ assert date.precision == DatePrecision.MONTH
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ # year
+ date_str = "1458"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date == Undate(1458, calendar="Seleucid")
+ assert date.calendar == Calendar.SELEUCID
+ assert date.precision == DatePrecision.YEAR
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ def test_gregorian_earliest_latest(self):
+ # earliest/latest should be converted to Gregorian for comparison
+
+ # full date
+ # Elul = month 6 (7 September, 1000 Gregorian)
+ date_str = "29 Elul 1311"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date.earliest == Date(1000, 9, 7)
+ assert date.latest == Date(1000, 9, 7)
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ date_str = "23 Adar I 1475"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date.earliest == Date(1164, 2, 25)
+ assert date.latest == Date(1164, 2, 25)
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ # month/year
+ date_str = "Tishri 1458"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date.earliest == Date(1146, 9, 16)
+ assert date.latest == Date(1146, 10, 15)
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+
+# TODO: update validation error to say seleucid instead of hebrew
+
+# seleucid_year = 1458
+# converted_date = convert_seleucid_date(f"Tishri {seleucid_year}")
+# converted_date_am = convert_hebrew_date(
+# f"Tishrei {seleucid_year + Calendar.SELEUCID_OFFSET}"
+# )
+# # the converted date range for Tishri Sel. should match that for Tishrei of the Anno Mundi year 3449 years later.
+# assert converted_date[0] == converted_date_am[0]
+# assert converted_date[1] == converted_date_am[1]
+
+# # leap day (Feb 29, 2020) should convert properly
+# converted_date = convert_seleucid_date("4 Adar 2331")
+# assert converted_date[1] == date(2020, 2, 29)
+
+
+# # 26 Tammuz 4816: 17 July, 1056; Tammuz = month 4
+# date = Undate(4816, 4, 26, calendar="Seleucid")
+# assert date.earliest == Date(1056, 7, 17)
+# assert date.latest == Date(1056, 7, 17)
+# # 13 Tishrei 5416 Anno Mundi (1655-10-14)
+# date = Undate(5416, 7, 13, calendar="Seleucid") # Tishrei = month 7
+# assert date.earliest == Date(1655, 10, 14)
+# assert date.latest == Date(1655, 10, 14)
+
+
+# from pgp tests
+
+
+# # month/year
+# seleucid_year = 1458
+# converted_date = convert_seleucid_date(f"Tishri {seleucid_year}")
+# converted_date_am = convert_hebrew_date(
+# f"Tishrei {seleucid_year + Calendar.SELEUCID_OFFSET}"
+# )
+# # the converted date range for Tishri Sel. should match that for Tishrei of the Anno Mundi year 3449 years later.
+# assert converted_date[0] == converted_date_am[0]
+# assert converted_date[1] == converted_date_am[1]
+
+# # leap day (Feb 29, 2020) should convert properly
+# converted_date = convert_seleucid_date("4 Adar 2331")
+# assert converted_date[1] == date(2020, 2, 29)
+
+# # leap year (4826 AM = 1377 Seleucid) should convert properly
+# seleucid_year = 1377
+# converted_date = convert_seleucid_date(f"21 Adar II {seleucid_year}")
+# converted_date_am = convert_hebrew_date(
+# f"21 Adar II {seleucid_year + Calendar.SELEUCID_OFFSET}"
+# )
+# assert converted_date[0] == converted_date_am[0]
+# assert converted_date[1] == converted_date_am[1]
+# # and it should be converted to 1066-03-21 CE
+# assert converted_date[1] == date(1066, 3, 21)
diff --git a/tests/test_date.py b/tests/test_date.py
index 5ff017d..d5c7d7b 100644
--- a/tests/test_date.py
+++ b/tests/test_date.py
@@ -1,3 +1,5 @@
+import datetime
+
import numpy as np
from undate.date import ONE_YEAR, Date, DatePrecision, Timedelta
@@ -51,6 +53,31 @@ def test_properties_day(self):
assert Date(2010, 5).day is None
assert Date(2021, 6, 15).day == 15
+ def test_weekday(self):
+ # thursday
+ assert Date(2025, 1, 2).weekday == 3
+ assert Date(2025, 1, 2).weekday == datetime.date(2025, 1, 2).weekday()
+ # friday
+ assert Date(2025, 1, 3).weekday == 4
+ assert Date(2025, 1, 3).weekday == datetime.date(2025, 1, 3).weekday()
+ # saturday
+ assert Date(2025, 1, 4).weekday == 5
+ assert Date(2025, 1, 4).weekday == datetime.date(2025, 1, 4).weekday()
+ # sunday
+ assert Date(2025, 1, 5).weekday == 6
+ assert Date(2025, 1, 5).weekday == datetime.date(2025, 1, 5).weekday()
+ # monday
+ assert Date(2025, 1, 6).weekday == 0
+ assert Date(2025, 1, 6).weekday == datetime.date(2025, 1, 6).weekday()
+ # tuesday
+ assert Date(2025, 1, 7).weekday == 1
+ assert Date(2025, 1, 7).weekday == datetime.date(2025, 1, 7).weekday()
+
+ # when a date is not day-level precision, no weekday is returned
+ yearonly_date = Date(2025)
+ assert yearonly_date.dtype == "datetime64[Y]"
+ assert yearonly_date.weekday is None
+
def test_substract(self):
# date - date = timedelta
date_difference = Date(2024, 1, 2) - Date(2024, 1, 1)
diff --git a/tests/test_undate.py b/tests/test_undate.py
index 18e03b0..16ea08c 100644
--- a/tests/test_undate.py
+++ b/tests/test_undate.py
@@ -130,6 +130,16 @@ def test_calendar(self):
with pytest.raises(ValueError, match="Calendar `foobar` is not supported"):
Undate(848, calendar="foobar")
+ def test_as_calendar(self):
+ # changes calendar *without* converting dates
+ assert Undate(1243, 5, 7).as_calendar(Calendar.ISLAMIC) == Undate(
+ 1243, 5, 7, calendar=Calendar.ISLAMIC
+ )
+ # should also work with string
+ assert Undate(1243, 5, 7).as_calendar("islamic") == Undate(
+ 1243, 5, 7, calendar=Calendar.ISLAMIC
+ )
+
def test_init_invalid(self):
with pytest.raises(ValueError):
Undate("19??")
@@ -298,11 +308,17 @@ def test_lt_notimplemented(self):
# how to compare mixed precision where dates overlap?
# if the second date falls *within* earliest/latest,
# then it is not clearly less; not implemented?
- with pytest.raises(NotImplementedError, match="date falls within the other"):
+ with pytest.raises(
+ NotImplementedError,
+ match="one date \\(2022-05\\) falls within the other \\(2022\\)",
+ ):
assert Undate(2022) < Undate(2022, 5)
# same if we attempt to compare in the other direction
- with pytest.raises(NotImplementedError, match="date falls within the other"):
+ with pytest.raises(
+ NotImplementedError,
+ match="one date \\(2022-05\\) falls within the other \\(2022\\)",
+ ):
assert Undate(2022, 5) < Undate(2022)
testdata_contains = [