diff --git a/.all-contributorsrc b/.all-contributorsrc
index f4f31f1..2d37c9b 100644
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -68,6 +68,16 @@
"test",
"doc"
]
+ },
+ {
+ "login": "taylor-arnold",
+ "name": "Taylor Arnold",
+ "avatar_url": "https://avatars.githubusercontent.com/u/5752184?v=4",
+ "profile": "http://taylorarnold.org",
+ "contributions": [
+ "review",
+ "ideas"
+ ]
}
],
"contributorsPerLine": 7,
@@ -77,5 +87,6 @@
"repoHost": "https://github.com",
"projectName": "undate-python",
"projectOwner": "dh-tech",
- "badgeTemplate": ""
+ "badgeTemplate": "",
+ "commitType": "docs"
}
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
index ae450b4..9a373b6 100644
--- a/.github/workflows/check.yml
+++ b/.github/workflows/check.yml
@@ -3,6 +3,9 @@ name: Check style + docs + types
on:
pull_request:
+permissions:
+ contents: read
+
jobs:
check:
runs-on: ubuntu-latest
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 31e01ed..3a05faf 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -1,5 +1,9 @@
name: unit tests
+permissions:
+ contents: read
+ id-token: write
+
on:
push:
branches:
@@ -33,26 +37,25 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
+ cache: 'pip'
+ cache-dependency-path: '**/pyproject.toml'
- - name: Install uv
- uses: astral-sh/setup-uv@v5
- with:
- enable-cache: true
- cache-dependency-glob: "pyproject.toml"
-
- - name: Install package with dev and test dependencies
- run: uv sync --extra test
+ - name: Install package with dependencies
+ run: pip install -e ".[test]"
# for all versions but the one we use for code coverage, run normally
- - name: Run unit tests normally
- run: uv run pytest
+ - name: Run unit tests without code coverage
+ run: pytest
if: ${{ matrix.python != env.COV_PYTHON_VERSION }}
# run code coverage in one version only
- name: Run unit tests with code coverage reporting
- run: uv run pytest --cov=undate
+ run: pytest --cov=.
if: ${{ matrix.python == env.COV_PYTHON_VERSION }}
- - name: Upload test coverage to Codecov
- uses: codecov/codecov-action@v3
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v4
+ env:
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
if: ${{ matrix.python == env.COV_PYTHON_VERSION }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 278df82..8b0a307 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,21 @@
# Change Log
+## 0.5
+
+- New `UnDelta` and `UnInt` classes for uncertain durations
+ - `Undate.duration` now returns a `Timedelta`, or an `UnDelta` if the duration is ambiguous
+- New properties `possible_years` and `representative_years` on `Undate` class, used for calculating durations for uncertain years and months
+- New `weekday` method on class `undate.date.Date`
+- Calendar converter improvements:
+ - Calendar converter classes can optionally provide minimum and maximum years for uncertain dates
+ - New calendar methods `days_in_year` and `representative_years`
+ - Hebrew date parser now allows for week days, along with additional month variants
+ - Preliminary Seleucid calendar converter class, based on Hebrew calendar with a year offset
+ - New method `as_calendar` on `Undate` class, to set calendar without doing any conversion
+- Readme examples have been improved and extended
+- New example notebook testing Hebrew, Islamic, and Seleucid date parsing and conversion with Princeton Geniza Project data
+- bugfix: duration for uncertain years previously returned the duration from earliest to latest possible dates in range; now returns an `UnDelta` with the possible durations for the possible years in the given calendar
+
## 0.4
- Undate is now Calendar aware / Calendar explicit; default is Gregorian
@@ -20,7 +36,6 @@
- Reorganized examples folder to avoid unnecessary nesting
- ISMI data has been updated from older JSON data to examples in RDF (turtle)
-
## 0.3.1
Update readthedocs config for current installation
@@ -49,7 +64,7 @@ Update readthedocs config for current installation
### numpy impact
-Performance differences seem to be negligible, but it does increase payload size. The virtualenv for installing version 0.2 was 14MB; when installing the newer version with numpy, the virtualenv is 46MB (the numpy folder in site packages is 31MB on its own).
+Performance differences seem to be negligible, but it does increase payload size. The virtualenv for installing version 0.2 was 14MB; when installing the newer version with numpy, the virtualenv is 46MB (the numpy folder in site packages is 31MB on its own).
## 0.2
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 415e17b..9180e0b 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ We use [All Contributors](https://allcontributors.org/) because we recognize tha
 Robert Casties 🔣 🤔 👀 |
 Julia Damerow 💻 👀 ⚠️ 📋 🤔 |
 Malte Vogl 💻 👀 ⚠️ 📖 |
+  Taylor Arnold 👀 🤔 |
diff --git a/README.md b/README.md
index c6e3560..cd73453 100644
--- a/README.md
+++ b/README.md
@@ -5,13 +5,13 @@
**undate** is a python library for working with uncertain or partially known dates.
> [!WARNING]
-> This is alpha software and is not yet feature complete! Use with caution and give us feedback.
-> Currently `undate` supports parsing and formatting dates in ISO8601, some
-portions of EDTF (Extended Date Time Format), and parsing and conversion for dates in Hebrew Anno Mundi and Islamic Hijri calendars
+> This is beta software and is not yet feature complete! Use with caution and give us feedback.
+> Currently `undate` supports parsing and formatting dates in ISO8601, some
+> portions of EDTF (Extended Date Time Format), and parsing and conversion for dates in Hebrew Anno Mundi and Islamic Hijri calendars.
-*Undate was initially created as part of a [DH-Tech](https://dh-tech.github.io/) hackathon in November 2022.*
+_Undate was initially created as part of a [DH-Tech](https://dh-tech.github.io/) hackathon in November 2022._
-* * *
+---
[](https://doi.org/10.5281/zenodo.11068867)
[](https://opensource.org/licenses/Apache-2.0)
@@ -19,20 +19,46 @@ portions of EDTF (Extended Date Time Format), and parsing and conversion for dat
[](https://github.com/dh-tech/undate-python/actions/workflows/unit_tests.yml)
[](https://codecov.io/gh/dh-tech/undate-python)
[](https://github.com/astral-sh/ruff)
+
+
[](CONTRIBUTORS.md)
+
Read [Contributors](CONTRIBUTORS.md) for detailed contribution information.
+## Installation
+
+_Recommended_: use pip to install the latest published version from PyPI:
+
+```console
+pip install undate
+```
+
+To install a development version or specific tag or branch, you can install from GitHub.
+Use the `@name` notation to specify the branch or tag; e.g., to install the development version:
+
+```console
+pip install git+https://github.com/dh-tech/undate-python@develop#egg=undate
+```
+
## Example Usage
-Often humanities and cultural data include imprecise or uncertain temporal information. We want to store that information but also work with it in a structured way, not just treat it as text for display. Different projects may need to work with or convert between different date formats or even different calendars.
+Often humanities and cultural data include imprecise or uncertain
+temporal information. We want to store that information but also work
+with it in a structured way, not just treat it as text for display.
+Different projects may need to work with or convert between different
+date formats or even different calendars.
-An `undate.Undate` is analogous to python’s builtin `datetime.date` object, but with support for varying degrees of precision and unknown information. You can initialize an undate with either strings or numbers for whichever parts of the date are known or partially known. An `Undate` can take an optional label.
+An `undate.Undate` is analogous to python’s builtin `datetime.date`
+object, but with support for varying degrees of precision and unknown
+information. You can initialize an `Undate` with either strings or
+numbers for whichever parts of the date are known or partially known.
+An `Undate` can take an optional label.
```python
-from undate.undate import Undate
+from undate import Undate
november7 = Undate(2000, 11, 7)
november = Undate(2000, 11)
@@ -46,12 +72,14 @@ easter1916 = Undate(1916, 4, 23, label="Easter 1916")
```
You can convert an `Undate` to string using a date formatter (current default is ISO8601):
+
```python
>>> [str(d) for d in [november7, november, year2k, november7_some_year]]
['2000-11-07', '2000-11', '2000', '--11-07']
```
If enough information is known, an `Undate` object can report on its duration:
+
```python
>>> december = Undate(2000, 12)
>>> feb_leapyear = Undate(2024, 2)
@@ -68,7 +96,9 @@ If enough information is known, an `Undate` object can report on its duration:
2024-02 - duration in days: 29
```
-If enough of the date is known and the precision supports it, you can check if one date falls within another date:
+If enough of the date is known and the precision supports it, you can
+check if one date falls within another date:
+
```python
>>> november7 = Undate(2000, 11, 7)
>>> november2000 = Undate(2000, 11)
@@ -86,7 +116,10 @@ False
False
```
-For dates that are imprecise or partially known, `undate` calculates earliest and latest possible dates for comparison purposes so you can sort dates and compare with equals, greater than, and less than. You can also compare with python `datetime.date` objects.
+For dates that are imprecise or partially known, `undate` calculates
+earliest and latest possible dates for comparison purposes so you can
+sort dates and compare with equals, greater than, and less than. You
+can also compare with python `datetime.date` objects.
```python
>>> november7_2020 = Undate(2020, 11, 7)
@@ -104,7 +137,8 @@ False
False
```
-When dates cannot be compared due to ambiguity or precision, comparison methods raise a `NotImplementedError`.
+When dates cannot be compared due to ambiguity or precision, comparison
+methods raise a `NotImplementedError`.
```python
>>> november_2020 = Undate(2020, 11)
@@ -118,17 +152,22 @@ Traceback (most recent call last):
NotImplementedError: Can't compare when one date falls within the other
```
-An `UndateInterval` is a date range between two `Undate` objects. Intervals can be open-ended, allow for optional labels, and can calculate duration if enough information is known
+An `UndateInterval` is a date range between two `Undate` objects.
+Intervals can be open-ended, allow for optional labels, and can
+calculate duration if enough information is known. `UndateIntervals`
+are inclusive (i.e., a closed interval), and include both the earliest
+and latest date as part of the range.
+
```python
->>> from undate.undate import UndateInterval
+>>> from undate import UndateInterval
>>> UndateInterval(Undate(1900), Undate(2000))
->>> UndateInterval(Undate(1900), Undate(2000), label="19th century")
->>> UndateInterval(Undate(1900), Undate(2000), label="19th century").duration().days
-36890
-
->>> UndateInterval(Undate(1900), Undate(2000), label="20th century")
-
+>>> UndateInterval(Undate(1801), Undate(1900), label="19th century")
+>>> UndateInterval(Undate(1801), Undate(1900), label="19th century").duration().days
+36524
+
+>>> UndateInterval(Undate(1901), Undate(2000), label="20th century")
+
>>> UndateInterval(latest=Undate(2000)) # before 2000
>>> UndateInterval(Undate(1900)) # after 1900
@@ -139,8 +178,10 @@ An `UndateInterval` is a date range between two `Undate` objects. Intervals can
31
```
-You can initialize `Undate` or `UndateInterval` objects by parsing a date string with a specific converter, and you can also output an `Undate` object in those formats.
-Currently available converters are "ISO8601" and "EDTF" and supported calendars.
+You can initialize `Undate` or `UndateInterval` objects by parsing a
+date string with a specific converter, and you can also output an
+`Undate` object in those formats. Currently available converters
+are "ISO8601" and "EDTF", as well as the supported calendars.
```python
>>> from undate import Undate
@@ -158,9 +199,17 @@ Currently available converters are "ISO8601" and "EDTF" and supported calendars.
### Calendars
-All `Undate` objects are calendar aware, and date converters include support for parsing and working with dates from other calendars. The Gregorian calendar is used by default; currently `undate` supports the Islamic Hijri calendar and the Hebrew Anno Mundi calendar based on calendar conversion logic implemented in the [convertdate](https://convertdate.readthedocs.io/en/latest/) package.
+All `Undate` objects are calendar aware, and date converters include
+support for parsing and working with dates from other calendars. The
+Gregorian calendar is used by default; currently `undate` supports the
+Islamic Hijri calendar and the Hebrew Anno Mundi calendar based on
+calendar conversion logic implemented in the
+[convertdate](https://convertdate.readthedocs.io/en/latest/) package.
-Dates are stored with the year, month, day and appropriate precision for the original calendar; internally, earliest and latest dates are calculated in Gregorian / Proleptic Gregorian calendar for standardized comparison across dates from different calendars.
+Dates are stored with the year, month, day and appropriate precision for
+the original calendar; internally, earliest and latest dates are
+calculated in Gregorian / Proleptic Gregorian calendar for standardized
+comparison across dates from different calendars.
```python
>>> from undate import Undate
@@ -181,9 +230,11 @@ Dates are stored with the year, month, day and appropriate precision for the ori
[, , ]
```
-* * *
+---
-For more examples, refer to the code notebooks included in the [examples](https://github.com/dh-tech/undate-python/tree/main/examples/) in this repository.
+For more examples, refer to the code notebooks included in the
+[examples](https://github.com/dh-tech/undate-python/tree/main/examples/) in this
+repository.
## Documentation
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000..ea08260
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,14 @@
+coverage:
+ status:
+ project:
+ default: false # disable default status check
+ app: # custom target for app code
+ target: auto # "auto" compares against the base commit's coverage
+ paths:
+ - "src/" # limit to files within src code directory
+ tests: # declare a new status context for "tests"
+ target: 100% # we always want 100% coverage here
+ paths:
+ - "tests/" # only include coverage in "tests/" folder
+
+
diff --git a/examples/pgp_dates.ipynb b/examples/pgp_dates.ipynb
new file mode 100644
index 0000000..43a858c
--- /dev/null
+++ b/examples/pgp_dates.ipynb
@@ -0,0 +1,4406 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2d231f1e-3944-4579-b868-504f7fb2d543",
+ "metadata": {},
+ "source": [
+ "# Princeton Geniza Project\n",
+ "\n",
+ "This notebook demonstrates parsing dates from non-Gregorian calendars and working with mixed-calendar dates.\n",
+ "\n",
+ "This notebook uses document data from the [Princeton Geniza Project](https://geniza.princeton.edu/), which is a database of fragmentary medieval documents found in the Cairo Geniza. Documents are written largely in Hebrew script in Hebrew and Arabic languages, and use a range of calendars including: \n",
+ "- Hebrew _Anno Mundi_\n",
+ "- Islamic _Hijri_\n",
+ "- Hebrew Seleucid calendar (_Anno Mundi_ calendar with a 3449 year offset)\n",
+ "\n",
+ "The dataset includes original dates and standardized Common Era dates (Julian before 1583, Gregorian after).\n",
+ "\n",
+ "This notebook uses the data published on GitHub at https://github.com/princetongenizalab/pgp-metadata\n",
+ "\n",
+ "\n",
+ "*Notebook authored by Rebecca Sutton Koeser, 2025.*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d9da1cf-6cc6-4b6a-9baf-782152998d82",
+ "metadata": {},
+ "source": [
+ "## Load and filter data\n",
+ "\n",
+ "Limit to documents with authoritative \"date on document\" set in the metadata."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "67c5532d-ebc4-4e1e-aa64-e6802ed1d971",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "pgp_documents_csv = \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n",
+ "documents = pd.read_csv(pgp_documents_csv)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "41dc5a05-a04b-4b6d-acfe-1f7b04849346",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Total documents: 35,187\n",
+ "Documents with dates: 4,451\n",
+ " date on document: 4,126\n",
+ " inferred dating: 331\n"
+ ]
+ }
+ ],
+ "source": [
+ "# limit to documents with dates\n",
+ "docs_with_dates = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()]\n",
+ "docs_with_docdate = documents[documents.doc_date_standard.notna()].copy()\n",
+ "docs_with_inferreddate = documents[documents.inferred_date_standard.notna()]\n",
+ "\n",
+ "print(f\"\"\"\n",
+ "Total documents: {len(documents):,}\n",
+ "Documents with dates: {len(docs_with_dates):,}\n",
+ " date on document: {len(docs_with_docdate):,}\n",
+ " inferred dating: {len(docs_with_inferreddate):,}\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "94d6340b-10d0-461b-b745-378ffa1ffcec",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1259 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1116-03-05 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 464 | \n",
+ " Tammuz 1288 | \n",
+ " Seleucid | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " 491 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1131 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1188-12-07 | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " 502 | \n",
+ " Tevet 1548 | \n",
+ " Seleucid | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " 506 | \n",
+ " Elul 1428 | \n",
+ " Seleucid | \n",
+ " 1117-08-01/1117-08-29 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " 516 | \n",
+ " First decade of Ḥeshvan 1442 | \n",
+ " Seleucid | \n",
+ " 1130-10-06/1130-10-15 | \n",
+ "
\n",
+ " \n",
+ " | 61 | \n",
+ " 524 | \n",
+ " Thursday, 12 Sivan 4795 | \n",
+ " Anno Mundi | \n",
+ " 1035-05-22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "5 449 1570 Seleucid \n",
+ "16 463 19 Adar 1427 Seleucid \n",
+ "17 464 Tammuz 1288 Seleucid \n",
+ "23 472 1337 Seleucid \n",
+ "36 491 NaN NaN \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid \n",
+ "43 502 Tevet 1548 Seleucid \n",
+ "47 506 Elul 1428 Seleucid \n",
+ "55 516 First decade of Ḥeshvan 1442 Seleucid \n",
+ "61 524 Thursday, 12 Sivan 4795 Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "5 1259 \n",
+ "16 1116-03-05 \n",
+ "17 0977-06-21/0977-07-19 \n",
+ "23 1025-08-28/1026-09-14 \n",
+ "36 1131 \n",
+ "41 1188-12-07 \n",
+ "43 1236-11-30/1236-12-28 \n",
+ "47 1117-08-01/1117-08-29 \n",
+ "55 1130-10-06/1130-10-15 \n",
+ "61 1035-05-22 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "docs_with_docdate[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4df9e446-1f9c-4585-9557-3164cc8ce01f",
+ "metadata": {},
+ "source": [
+ "## Parse dates (standard and original)\n",
+ "\n",
+ "Parse the standardized date (Julian/Gregorian) as EDTF; in some cases this may fail due to invalid user-entered data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b9703b47-a7e2-4178-a7da-fb47db11b5b7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parse error on 1217-02-20/1217-02-29: Error trying to process rule \"date\":\n",
+ "\n",
+ "Day out of range in datetime string \"1217-02-29\"\n",
+ "Parse error on 1747-02-29: Error trying to process rule \"date\":\n",
+ "\n",
+ "Day out of range in datetime string \"1747-02-29\"\n"
+ ]
+ }
+ ],
+ "source": [
+ "from lark.visitors import VisitError\n",
+ "\n",
+ "# first, how far can we get with the standard dates? can we parse as edtf and sort, render?\n",
+ "from undate import Undate \n",
+ "\n",
+ "def parse_standard_date(value):\n",
+ " try:\n",
+ " return Undate.parse(value, \"EDTF\")\n",
+ " except VisitError as err:\n",
+ " print(f\"Parse error on {value}: {err}\")\n",
+ " \n",
+ "\n",
+ "# ignore gregorian/julian distinction for now\n",
+ "# from pgp code:\n",
+ "# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October\n",
+ "# cut off between gregorian/julian dates, in julian days\n",
+ "#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n",
+ "\n",
+ "docs_with_docdate['undate_standard'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "f49e82a4-b05b-4395-998f-0c9e75729e9f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " last_modified | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 3190 | \n",
+ " 3957 | \n",
+ " middle decade of Adar 1528 | \n",
+ " Seleucid | \n",
+ " 1217-02-20/1217-02-29 | \n",
+ " 2025-04-12 20:45:36.603800+00:00 | \n",
+ "
\n",
+ " \n",
+ " | 34437 | \n",
+ " 40006 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1747-02-29 | \n",
+ " 2024-08-07 18:24:19.425288+00:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "3190 3957 middle decade of Adar 1528 Seleucid \n",
+ "34437 40006 NaN NaN \n",
+ "\n",
+ " doc_date_standard last_modified \n",
+ "3190 1217-02-20/1217-02-29 2025-04-12 20:45:36.603800+00:00 \n",
+ "34437 1747-02-29 2024-08-07 18:24:19.425288+00:00 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what are the records with standardized dates that couldn't be parsed?\n",
+ "\n",
+ "# this is probably a data error in the original\n",
+ "\n",
+ "docs_with_docdate[docs_with_docdate.undate_standard.isna()][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3632e7f2-aae9-4136-9bb0-32789de34c4e",
+ "metadata": {},
+ "source": [
+ "What calendars are used by documents with original dates?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "2d502575-a2b4-4fce-9f59-6932275dfac2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Seleucid 1604\n",
+ "Anno Mundi 1147\n",
+ "Hijrī 884\n",
+ "Kharājī 8\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "docs_with_docdate.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "04e4ffb2-13e7-49cc-913b-2104b61aef16",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 61 | \n",
+ " 524 | \n",
+ " Thursday, 12 Sivan 4795 | \n",
+ " Anno Mundi | \n",
+ " 1035-05-22 | \n",
+ "
\n",
+ " \n",
+ " | 90 | \n",
+ " 561 | \n",
+ " 10 Nisan 4716 | \n",
+ " Anno Mundi | \n",
+ " 0956-03-24 | \n",
+ "
\n",
+ " \n",
+ " | 111 | \n",
+ " 582 | \n",
+ " Thursday, 6 Adar 4996 | \n",
+ " Anno Mundi | \n",
+ " 1236-02-14 | \n",
+ "
\n",
+ " \n",
+ " | 119 | \n",
+ " 591 | \n",
+ " Sunday, 29 Tammuz 4898 | \n",
+ " Anno Mundi | \n",
+ " 1138-07-10 | \n",
+ "
\n",
+ " \n",
+ " | 131 | \n",
+ " 603 | \n",
+ " 4805/4806 | \n",
+ " Anno Mundi | \n",
+ " 1044-08-27/1045-09-13 | \n",
+ "
\n",
+ " \n",
+ " | 177 | \n",
+ " 660 | \n",
+ " 22 Sivan 4974 | \n",
+ " Anno Mundi | \n",
+ " 1214-06-01 | \n",
+ "
\n",
+ " \n",
+ " | 207 | \n",
+ " 695 | \n",
+ " Friday, [25] Nisan [4810] | \n",
+ " Anno Mundi | \n",
+ " 1050-04-20 | \n",
+ "
\n",
+ " \n",
+ " | 215 | \n",
+ " 703 | \n",
+ " 8 Elul (4)811 | \n",
+ " Anno Mundi | \n",
+ " 1051-08-18 | \n",
+ "
\n",
+ " \n",
+ " | 255 | \n",
+ " 750 | \n",
+ " Friday, 24 Ḥeshvan 4765 | \n",
+ " Anno Mundi | \n",
+ " 1004-11-10 | \n",
+ "
\n",
+ " \n",
+ " | 264 | \n",
+ " 760 | \n",
+ " Thursday, 11 Av 4783 | \n",
+ " Anno Mundi | \n",
+ " 1023-08-01 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard\n",
+ "61 524 Thursday, 12 Sivan 4795 Anno Mundi 1035-05-22\n",
+ "90 561 10 Nisan 4716 Anno Mundi 0956-03-24\n",
+ "111 582 Thursday, 6 Adar 4996 Anno Mundi 1236-02-14\n",
+ "119 591 Sunday, 29 Tammuz 4898 Anno Mundi 1138-07-10\n",
+ "131 603 4805/4806 Anno Mundi 1044-08-27/1045-09-13\n",
+ "177 660 22 Sivan 4974 Anno Mundi 1214-06-01\n",
+ "207 695 Friday, [25] Nisan [4810] Anno Mundi 1050-04-20\n",
+ "215 703 8 Elul (4)811 Anno Mundi 1051-08-18\n",
+ "255 750 Friday, 24 Ḥeshvan 4765 Anno Mundi 1004-11-10\n",
+ "264 760 Thursday, 11 Av 4783 Anno Mundi 1023-08-01"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# example hebrew dates\n",
+ "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "101b8194-35b3-4e7e-b3e4-68dfec2e932c",
+ "metadata": {},
+ "source": [
+ "### Inspect variations in the data that may cause problems for parsing\n",
+ "\n",
+ "There are some idiosyncrasies with the original dates, since some of them were entered before the PGPv4 system supported built-in conversion.\n",
+ "\n",
+ "- calendar abbreviation included in the date string (i.e., AM, AH for _Anno Mundi_, _Anno Hegirae_ respectively)\n",
+ "- brackets for inferred digits or unknown digits (e.g., `152[.]` or `[4]82[.]`)\n",
+ "- ordinals instead of numerals for the day of the month (e.g., \"11th Tammuz 4767\" or \"Monday, 27th Ṭevet 4797\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "4d11e583-7c80-44ed-80b1-d0c5b7b7f408",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_38072/1200615794.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
+ " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 702 | \n",
+ " 1223 | \n",
+ " Wednesday, 9 Tammuz 4912 AM | \n",
+ " Anno Mundi | \n",
+ " 1152-06-13 | \n",
+ "
\n",
+ " \n",
+ " | 16698 | \n",
+ " 19975 | \n",
+ " Sunday, 10 Kislev 5583 AM | \n",
+ " Anno Mundi | \n",
+ " 1822-11-24 | \n",
+ "
\n",
+ " \n",
+ " | 25415 | \n",
+ " 30550 | \n",
+ " Tammuz 5537 AM | \n",
+ " Anno Mundi | \n",
+ " 1777-07-06/1777-08-03 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "702 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n",
+ "16698 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n",
+ "25415 30550 Tammuz 5537 AM Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "702 1152-06-13 \n",
+ "16698 1822-11-24 \n",
+ "25415 1777-07-06/1777-08-03 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many end with AM ?\n",
+ "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n",
+ "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "cd1a751a-5299-418f-a3f8-050ab0384354",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1556 | \n",
+ " 2163 | \n",
+ " first third of Tammuz 500[.] | \n",
+ " Anno Mundi | \n",
+ " 1244/1249 | \n",
+ "
\n",
+ " \n",
+ " | 1567 | \n",
+ " 2175 | \n",
+ " End of Sivan 152[.] | \n",
+ " Seleucid | \n",
+ " 1209/1218 | \n",
+ "
\n",
+ " \n",
+ " | 1753 | \n",
+ " 2460 | \n",
+ " 13[..] | \n",
+ " Seleucid | \n",
+ " 988/1088 | \n",
+ "
\n",
+ " \n",
+ " | 2018 | \n",
+ " 2745 | \n",
+ " 1[.] Kislev 48[..] | \n",
+ " Anno Mundi | \n",
+ " 1039-11-30/1138-11-24 | \n",
+ "
\n",
+ " \n",
+ " | 3044 | \n",
+ " 3805 | \n",
+ " 13[..] | \n",
+ " Seleucid | \n",
+ " 988/1087 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 30589 | \n",
+ " 35955 | \n",
+ " 12 Muḥarram 52[.] | \n",
+ " Hijrī | \n",
+ " 1126/1134 | \n",
+ "
\n",
+ " \n",
+ " | 31226 | \n",
+ " 36738 | \n",
+ " 54[.] | \n",
+ " Hijrī | \n",
+ " 1145/1154 | \n",
+ "
\n",
+ " \n",
+ " | 32548 | \n",
+ " 38077 | \n",
+ " 14[...] | \n",
+ " Seleucid | \n",
+ " 1088-09-19/1188-09-23 | \n",
+ "
\n",
+ " \n",
+ " | 34652 | \n",
+ " 40226 | \n",
+ " 49[.] | \n",
+ " Hijrī | \n",
+ " 1096-12-19/1106-09-01 | \n",
+ "
\n",
+ " \n",
+ " | 34760 | \n",
+ " 40335 | \n",
+ " [4]82[.] | \n",
+ " Anno Mundi | \n",
+ " 1059-09-11/1069-09-18 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
66 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "1567 2175 End of Sivan 152[.] Seleucid \n",
+ "1753 2460 13[..] Seleucid \n",
+ "2018 2745 1[.] Kislev 48[..] Anno Mundi \n",
+ "3044 3805 13[..] Seleucid \n",
+ "... ... ... ... \n",
+ "30589 35955 12 Muḥarram 52[.] Hijrī \n",
+ "31226 36738 54[.] Hijrī \n",
+ "32548 38077 14[...] Seleucid \n",
+ "34652 40226 49[.] Hijrī \n",
+ "34760 40335 [4]82[.] Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "1556 1244/1249 \n",
+ "1567 1209/1218 \n",
+ "1753 988/1088 \n",
+ "2018 1039-11-30/1138-11-24 \n",
+ "3044 988/1087 \n",
+ "... ... \n",
+ "30589 1126/1134 \n",
+ "31226 1145/1154 \n",
+ "32548 1088-09-19/1188-09-23 \n",
+ "34652 1096-12-19/1106-09-01 \n",
+ "34760 1059-09-11/1069-09-18 \n",
+ "\n",
+ "[66 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many include periods?\n",
+ "docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "9fa8d2ba-6612-4de5-8741-dea177f99412",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 635 | \n",
+ " 1154 | \n",
+ " Last decade of Kislev 5004 | \n",
+ " Anno Mundi | \n",
+ " 1243-12 | \n",
+ "
\n",
+ " \n",
+ " | 1172 | \n",
+ " 1750 | \n",
+ " 11th Tammuz 4767 | \n",
+ " Anno Mundi | \n",
+ " 1007 | \n",
+ "
\n",
+ " \n",
+ " | 1173 | \n",
+ " 1751 | \n",
+ " Monday, 27th Ṭevet 4797 | \n",
+ " Anno Mundi | \n",
+ " 1037-01-23 | \n",
+ "
\n",
+ " \n",
+ " | 1556 | \n",
+ " 2163 | \n",
+ " first third of Tammuz 500[.] | \n",
+ " Anno Mundi | \n",
+ " 1244/1249 | \n",
+ "
\n",
+ " \n",
+ " | 5142 | \n",
+ " 6795 | \n",
+ " last decade of Tishrei 4991 | \n",
+ " Anno Mundi | \n",
+ " 1230-09-29/1230-10-08 | \n",
+ "
\n",
+ " \n",
+ " | 5223 | \n",
+ " 6892 | \n",
+ " last decade of Iyyar 4906 | \n",
+ " Anno Mundi | \n",
+ " 1146-05-04/1146-05-13 | \n",
+ "
\n",
+ " \n",
+ " | 5664 | \n",
+ " 7409 | \n",
+ " last third of Ḥeshvan 4965 | \n",
+ " Anno Mundi | \n",
+ " 1204-10-17/1204-10-25 | \n",
+ "
\n",
+ " \n",
+ " | 5812 | \n",
+ " 7581 | \n",
+ " middle third of Adar 4876 | \n",
+ " Anno Mundi | \n",
+ " 1116-05 | \n",
+ "
\n",
+ " \n",
+ " | 7024 | \n",
+ " 9068 | \n",
+ " Last decade of Ṭevet 4898 | \n",
+ " Anno Mundi | \n",
+ " 1138-01 | \n",
+ "
\n",
+ " \n",
+ " | 8638 | \n",
+ " 11215 | \n",
+ " Middle third of Av 4889 | \n",
+ " Anno Mundi | \n",
+ " 1129-07-29/1129-08-07 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "635 1154 Last decade of Kislev 5004 Anno Mundi \n",
+ "1172 1750 11th Tammuz 4767 Anno Mundi \n",
+ "1173 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n",
+ "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "5142 6795 last decade of Tishrei 4991 Anno Mundi \n",
+ "5223 6892 last decade of Iyyar 4906 Anno Mundi \n",
+ "5664 7409 last third of Ḥeshvan 4965 Anno Mundi \n",
+ "5812 7581 middle third of Adar 4876 Anno Mundi \n",
+ "7024 9068 Last decade of Ṭevet 4898 Anno Mundi \n",
+ "8638 11215 Middle third of Av 4889 Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "635 1243-12 \n",
+ "1172 1007 \n",
+ "1173 1037-01-23 \n",
+ "1556 1244/1249 \n",
+ "5142 1230-09-29/1230-10-08 \n",
+ "5223 1146-05-04/1146-05-13 \n",
+ "5664 1204-10-17/1204-10-25 \n",
+ "5812 1116-05 \n",
+ "7024 1138-01 \n",
+ "8638 1129-07-29/1129-08-07 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many use ordinals instead of numerals?\n",
+ "hebrew_dates[hebrew_dates.doc_date_original.str.contains(\"st\") | hebrew_dates.doc_date_original.str.contains(\"rd\") | hebrew_dates.doc_date_original.str.contains(\"th\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "5b6d5811-fe81-471d-bd29-896cec4c98ff",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11th Tammuz 4767: 11 Tammuz 4767\n",
+ "27th Tevet: 27 Tevet\n",
+ "8th Kislev: 8 Kislev\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "\n",
+ "def remove_ordinals(val):\n",
+ " return re.sub(r'(\\d+)(st|nd|rd|th)', \"\\\\1\", val)\n",
+ "\n",
+ "# test removing ordinals without removing the numbers\n",
+ "for val in ['11th Tammuz 4767', \"27th Tevet\", \"8th Kislev\"]:\n",
+ " print(f\"{val}: { remove_ordinals(val)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b0347b7-954b-4d2e-ad95-44dc2e24ac01",
+ "metadata": {},
+ "source": [
+ "Since this dataset has a mix of calendars and has known inconsistencies that may need cleaning,\n",
+ "we define a custom parsing method that selects the appropriate calendar and simplifies date portions that are not currently supported by the undate parsers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "798da8f2-2332-48c2-aeec-214474e9d49c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# parse hijri, anno mundi, and seleucid dates as undates\n",
+ "\n",
+ "from lark.exceptions import UnexpectedEOF\n",
+ "\n",
+ "# set this to True to see details about parsing\n",
+ "VERBOSE_PARSE_OUTPUT = False \n",
+ "\n",
+ "\n",
+ "def parse_original_date(row):\n",
+ " # print(f\"PGPID {row.pgpid} {row.doc_date_original} ({row.doc_date_calendar})\")\n",
+ " undate_calendar = None\n",
+ " if row.doc_date_calendar == \"Anno Mundi\":\n",
+ " undate_calendar = \"Hebrew\"\n",
+ " elif row.doc_date_calendar == \"Hijrī\":\n",
+ " undate_calendar = \"Islamic\"\n",
+ " elif row.doc_date_calendar == \"Seleucid\":\n",
+ " # handle seleucid as hebrew with offset (adapt from pgp code)\n",
+ " undate_calendar = \"Seleucid\"\n",
+ "\n",
+ " \n",
+ " if undate_calendar:\n",
+ " value = row.doc_date_original\n",
+ "\n",
+ " # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]\n",
+ " # ... the calendar parsers don't support this, even though Undate does support unknown digits\n",
+ " # in future, perhaps we can add missing digit logic with this syntax to share across appropriate parsers\n",
+ " if '[.' in value:\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"ignoring missing digits for now {value}\")\n",
+ " value = value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\") \n",
+ " \n",
+ " # some dates have inferred numbers, e.g. Friday, [25] Nisan [4810] or 8 Elul (4)811\n",
+ " # for now, just strip out brackets before parsing; \n",
+ " # in future, could potentially infer uncertainty based on these\n",
+ " value = value.replace('[', '').replace(']', '').replace('(', '').replace(')', '')\n",
+ "\n",
+ " # for now, remove modifiers that are not supported by undate parser:\n",
+ " # Late Tevet 4903, Last decade of Kislev 5004, first third of ...\n",
+ " # some dates include of, e.g. day of month\n",
+ " modifiers = [\"Late \", \"(first|middle|last)( third|half|decade|tenth)? (of )?\", \"(Beginning|end) of \", \"last day\", \"First 10 days\", \" of\", \"spring\", \"decade \", \"night, \"]\n",
+ " for mod in modifiers:\n",
+ " value = re.sub(mod, \"\", value, flags=re.I)\n",
+ "\n",
+ " # there are a handful of misspelled wednesdays...\n",
+ " value = value.replace(\"Wedensday\", \"Wednesday\")\n",
+ " # and a Thrusday\n",
+ " value = value.replace(\"Thrusday\", \"Thursday\")\n",
+ "\n",
+ " # three Hebrew calendar dates include text \"AM\" at end; at least one AH date\n",
+ " if value.endswith(\" AM\") or value.endswith(\" AH\"):\n",
+ " value = value[:-3]\n",
+ " if value.endswith(\".\"): # strip off trailing period\n",
+ " value = value[:-1]\n",
+ "\n",
+ " # about 62 have ordinals; strip them out\n",
+ " value = remove_ordinals(value)\n",
+ " \n",
+ " try:\n",
+ " return Undate.parse(value, undate_calendar)\n",
+ " except (VisitError, ValueError, UnexpectedEOF) as err:\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\")\n",
+ "\n",
+ " # there are a handful of cases in PGP where calendars are mixed,\n",
+ " # i.e. hebrew months used for hijri calendar\n",
+ "\n",
+ " # some dates are entered in ISO format for another calendar; can we parse and set calendar?\n",
+ " if \"-\" in value and \"/\" not in value: # exclude intervals for now\n",
+ " try:\n",
+ " parsed = Undate.parse(value, \"ISO8601\")\n",
+ " if parsed:\n",
+ " parsed = parsed.as_calendar(undate_calendar)\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\")\n",
+ " return parsed\n",
+ " except ValueError as err:\n",
+ " if VERBOSE_PARSE_OUTPUT:\n",
+ " print(f\"Could not parse {value} as ISO date: {err}\")\n",
+ "\n",
+ "docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b2bfb96-2d8b-4f09-a9a6-c2534273d503",
+ "metadata": {},
+ "source": [
+ "### Review parsing results \n",
+ "\n",
+ "How many of the dates in supported calendars were parsed?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "623eb160-ab6c-44ba-b3f4-6770c2c7bd86",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "original dates parsed: 3462\n",
+ "original dates unparsed: 173 (anno mundi, hijri, and seleucid calendars)\n",
+ "proportion parsed: 95.24%\n"
+ ]
+ }
+ ],
+ "source": [
+ "orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()\n",
+ "orig_dates_unparsed = docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid']) & docs_with_docdate.undate_orig.isna()] \n",
+ "\n",
+ "total_parsed = len(orig_dates_parsed)\n",
+ "total_unparsed = len(orig_dates_unparsed)\n",
+ "print(f\"\"\"original dates parsed: {total_parsed}\n",
+ "original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)\n",
+ "proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ae5b3cfa-ed25-4a3d-ae78-c7590543ba20",
+ "metadata": {},
+ "source": [
+ "What is the date granularity of the dates that were parsed?\n",
+ "\n",
+ "Note that these results are skewed somewhat due to the modifiers and uncertainty that we are simplifying in order to parse the dates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "42945787-6788-422d-9a04-f983ec6b31af",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1259 | \n",
+ " 1259 | \n",
+ " 1570 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1116-03-05 | \n",
+ " 1116-03-05 | \n",
+ " 1427-12-19 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 464 | \n",
+ " Tammuz 1288 | \n",
+ " Seleucid | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ " 1288-04 | \n",
+ " month | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1337 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1188-12-07 | \n",
+ " 1188-12-07 | \n",
+ " 1500-09-15 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5 449 1570 Seleucid 1259 \n",
+ "16 463 19 Adar 1427 Seleucid 1116-03-05 \n",
+ "17 464 Tammuz 1288 Seleucid 0977-06-21/0977-07-19 \n",
+ "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n",
+ "\n",
+ " undate_standard undate_orig orig_date_precision \n",
+ "5 1259 1570 year \n",
+ "16 1116-03-05 1427-12-19 day \n",
+ "17 0977-06-21/0977-07-19 1288-04 month \n",
+ "23 1025-08-28/1026-09-14 1337 year \n",
+ "41 1188-12-07 1500-09-15 day "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# determine original date precision based on parsed undate\n",
+ "orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())\n",
+ "orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision']].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "88f1d3ab-e1c7-48b5-8907-5aeea463f1e8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_date_precision\n",
+ "day 1599\n",
+ "month 1027\n",
+ "year 836\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# this is skewed because of the kinds of dates we're not able to parse or modifiers we're omitting entirely\n",
+ "orig_dates_parsed.orig_date_precision.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3fec8455-1830-48b5-961c-4ae0847bf63c",
+ "metadata": {},
+ "source": [
+ "Check on the Seleucid date parsing by comparing undate calendar conversion with the standardized CE date included in the dataset.\n",
+ "\n",
+ "We expect `undate` dates before 1583 to be off by several days (up to about 10) since we did not adjust for the Julian calendar."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "5d3a55b0-ed36-47ba-b022-848bb128b449",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " doc_date_standard | \n",
+ " undate_earliest | \n",
+ " undate_latest | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1570 | \n",
+ " year | \n",
+ " 1259 | \n",
+ " 1258-09-07 | \n",
+ " 1259-09-26 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1427-12-19 | \n",
+ " day | \n",
+ " 1116-03-05 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 464 | \n",
+ " Tammuz 1288 | \n",
+ " Seleucid | \n",
+ " 1288-04 | \n",
+ " month | \n",
+ " 0977-06-21/0977-07-19 | \n",
+ " 0977-06-26 | \n",
+ " 0977-07-24 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1337 | \n",
+ " year | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1025-09-03 | \n",
+ " 1026-09-20 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1500-09-15 | \n",
+ " day | \n",
+ " 1188-12-07 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " 502 | \n",
+ " Tevet 1548 | \n",
+ " Seleucid | \n",
+ " 1548-10 | \n",
+ " month | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ " 1236-12-07 | \n",
+ " 1237-01-04 | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " 506 | \n",
+ " Elul 1428 | \n",
+ " Seleucid | \n",
+ " 1428-06 | \n",
+ " month | \n",
+ " 1117-08-01/1117-08-29 | \n",
+ " 1117-08-08 | \n",
+ " 1117-09-05 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " 516 | \n",
+ " First decade of Ḥeshvan 1442 | \n",
+ " Seleucid | \n",
+ " 1442-08 | \n",
+ " month | \n",
+ " 1130-10-06/1130-10-15 | \n",
+ " 1130-10-13 | \n",
+ " 1130-11-10 | \n",
+ "
\n",
+ " \n",
+ " | 73 | \n",
+ " 537 | \n",
+ " Ḥeshvan 1453 | \n",
+ " Seleucid | \n",
+ " 1453-08 | \n",
+ " month | \n",
+ " 1141 | \n",
+ " 1141-10-11 | \n",
+ " 1141-11-08 | \n",
+ "
\n",
+ " \n",
+ " | 75 | \n",
+ " 544 | \n",
+ " Sunday, 21 Kislev 1355 | \n",
+ " Seleucid | \n",
+ " 1355-09-21 | \n",
+ " day | \n",
+ " 1043-11-26 | \n",
+ " 1043-12-02 | \n",
+ " 1043-12-02 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar undate_orig \\\n",
+ "5 449 1570 Seleucid 1570 \n",
+ "16 463 19 Adar 1427 Seleucid 1427-12-19 \n",
+ "17 464 Tammuz 1288 Seleucid 1288-04 \n",
+ "23 472 1337 Seleucid 1337 \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid 1500-09-15 \n",
+ "43 502 Tevet 1548 Seleucid 1548-10 \n",
+ "47 506 Elul 1428 Seleucid 1428-06 \n",
+ "55 516 First decade of Ḥeshvan 1442 Seleucid 1442-08 \n",
+ "73 537 Ḥeshvan 1453 Seleucid 1453-08 \n",
+ "75 544 Sunday, 21 Kislev 1355 Seleucid 1355-09-21 \n",
+ "\n",
+ " orig_date_precision doc_date_standard undate_earliest undate_latest \n",
+ "5 year 1259 1258-09-07 1259-09-26 \n",
+ "16 day 1116-03-05 1116-03-12 1116-03-12 \n",
+ "17 month 0977-06-21/0977-07-19 0977-06-26 0977-07-24 \n",
+ "23 year 1025-08-28/1026-09-14 1025-09-03 1026-09-20 \n",
+ "41 day 1188-12-07 1188-12-14 1188-12-14 \n",
+ "43 month 1236-11-30/1236-12-28 1236-12-07 1237-01-04 \n",
+ "47 month 1117-08-01/1117-08-29 1117-08-08 1117-09-05 \n",
+ "55 month 1130-10-06/1130-10-15 1130-10-13 1130-11-10 \n",
+ "73 month 1141 1141-10-11 1141-11-08 \n",
+ "75 day 1043-11-26 1043-12-02 1043-12-02 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seleucid_dates = orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'].copy()\n",
+ "# add undate earliest/latest (Gregorian) for comparison with dataset standardized date \n",
+ "seleucid_dates['undate_earliest'] = seleucid_dates.undate_orig.apply(lambda x: x.earliest)\n",
+ "seleucid_dates['undate_latest'] = seleucid_dates.undate_orig.apply(lambda x: x.latest)\n",
+ "\n",
+ "seleucid_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'undate_orig', 'orig_date_precision', 'doc_date_standard', 'undate_earliest', 'undate_latest']].head(10)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "a104d772-6c2c-4711-91ec-8cf1f108ae23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# can we sort by parsed original dates? \n",
+ "# doesn't work currently because of overlapping dates / different granularity\n",
+ "#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29f5f6eb-9b7d-4a4a-815a-29002d1d024b",
+ "metadata": {},
+ "source": [
+ "## Plot documents by date\n",
+ "\n",
+ "For the dates we could parse, how are the documents distributed over time and calendar?\n",
+ "\n",
+ "First let's graph by year based on the midpoint of the date range."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "c653d928-8fec-4ddc-9abf-ace2f7ca6629",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set earliest/latest for graphing\n",
+ "\n",
+ "# NOTE: we have to cast type to something pandas/altair supports\n",
+ "\n",
+ "orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')\n",
+ "orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')\n",
+ "orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "91f155fe-d0e6-4ee4-99de-698ac301e3f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orig_date_earliest | \n",
+ " orig_date_latest | \n",
+ " orig_date_mid | \n",
+ " pgpid | \n",
+ " doc_date_calendar | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 1258-09-07 | \n",
+ " 1259-09-26 | \n",
+ " 1259-03-18 | \n",
+ " 449 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ " 463 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 977-06-26 | \n",
+ " 977-07-24 | \n",
+ " 977-07-10 | \n",
+ " 464 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 1025-09-03 | \n",
+ " 1026-09-20 | \n",
+ " 1026-03-13 | \n",
+ " 472 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ " 499 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " 1236-12-07 | \n",
+ " 1237-01-04 | \n",
+ " 1236-12-21 | \n",
+ " 502 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " 1117-08-08 | \n",
+ " 1117-09-05 | \n",
+ " 1117-08-22 | \n",
+ " 506 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " 1130-10-13 | \n",
+ " 1130-11-10 | \n",
+ " 1130-10-27 | \n",
+ " 516 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " | 61 | \n",
+ " 1035-05-28 | \n",
+ " 1035-05-28 | \n",
+ " 1035-05-28 | \n",
+ " 524 | \n",
+ " Anno Mundi | \n",
+ "
\n",
+ " \n",
+ " | 62 | \n",
+ " 1034-08-25 | \n",
+ " 1034-09-22 | \n",
+ " 1034-09-08 | \n",
+ " 525 | \n",
+ " Hijrī | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orig_date_earliest orig_date_latest orig_date_mid pgpid doc_date_calendar\n",
+ "5 1258-09-07 1259-09-26 1259-03-18 449 Seleucid\n",
+ "16 1116-03-12 1116-03-12 1116-03-12 463 Seleucid\n",
+ "17 977-06-26 977-07-24 977-07-10 464 Seleucid\n",
+ "23 1025-09-03 1026-09-20 1026-03-13 472 Seleucid\n",
+ "41 1188-12-14 1188-12-14 1188-12-14 499 Seleucid\n",
+ "43 1236-12-07 1237-01-04 1236-12-21 502 Seleucid\n",
+ "47 1117-08-08 1117-09-05 1117-08-22 506 Seleucid\n",
+ "55 1130-10-13 1130-11-10 1130-10-27 516 Seleucid\n",
+ "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi\n",
+ "62 1034-08-25 1034-09-22 1034-09-08 525 Hijrī"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "144b2a4a-81cf-4a6d-a277-3a7910354a77",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# graph documents by calendar\n",
+ "import altair as alt\n",
+ "\n",
+ "date_docs_cal = orig_dates_parsed[orig_dates_parsed.doc_date_standard.notna()]\n",
+ "\n",
+ "dated_docs_cal = date_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n",
+ "dated_docs_cal['midpoint_year'] = dated_docs_cal.orig_date_mid.apply(lambda x: x.year)\n",
+ "\n",
+ "orig_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n",
+ " x=alt.X('midpoint_year', title=\"Year (midpoint)\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n",
+ " y=alt.Y('count(pgpid)', title='Documents'),\n",
+ " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\")\n",
+ ").properties(width=900, height=200, title=\"Documents by calendar (original date)\")\n",
+ "\n",
+ "orig_dates_calendars_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8e8cd7c-0711-40ae-84f6-d3f8df6d5ccc",
+ "metadata": {},
+ "source": [
+ "For comparison, what does it look like if we graph by the standardized dates in the dataset?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "4acc9a2b-d403-4f93-b2c5-6fee92ead105",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# graph documents with calendars\n",
+ "\n",
+ "def undate_midpoint(value):\n",
+ " # parsed standard date could be an undate or an interval; handle either\n",
+ " if isinstance(value, Undate):\n",
+ " earliest = value.earliest\n",
+ " latest = value.latest\n",
+ " else: # interval\n",
+ " earliest = value.earliest.earliest\n",
+ " latest = value.latest.latest\n",
+ " return earliest + (latest - earliest)/2\n",
+ " \n",
+ "\n",
+ "dated_docs_cal = docs_with_docdate.copy()\n",
+ "dated_docs_cal = dated_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n",
+ "# get the midpoint from the parsed standard date; convert to supported type\n",
+ "dated_docs_cal['midpoint'] = dated_docs_cal.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n",
+ "dated_docs_cal['midpoint_year'] = dated_docs_cal.midpoint.apply(lambda x: x.year if pd.notna(x) else None)\n",
+ "\n",
+ "\n",
+ "std_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n",
+ " x=alt.X('midpoint_year', title=\"Year\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n",
+ " y=alt.Y('count(pgpid)', title='Documents'),\n",
+ " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\").scale(domain=['Anno Mundi', 'Hijrī', 'Seleucid', 'Kharājī', 'Unspecified'])\n",
+ ").properties(width=900, height=200, title=\"Documents by calendar (standard date)\")\n",
+ "\n",
+ "std_dates_calendars_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f42471a4-0c64-4237-92c0-0d201377fa9f",
+ "metadata": {},
+ "source": [
+ "Here are the two plots together. The unspecified calendars are most likely Julian/Gregorian dates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "4d7c4d5f-636c-42a0-a906-21c67f5781b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_calendars_chart & std_dates_calendars_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc8a4617-ca69-4494-a2ef-6f4d442b82e6",
+ "metadata": {},
+ "source": [
+ "We can try graphing by range, but our parsing currently excludes the original dates with larger ranges."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "c5861110-dbd5-4d7a-8ada-acf7cb871aa7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].copy()\n",
+ "# graphable_data['midpoint'] = graphable_data.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n",
+ "graphable_data['midpoint_year'] = graphable_data.orig_date_mid.apply(lambda x: x.year if pd.notna(x) else None)\n",
+ "\n",
+ "\n",
+ "bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(\n",
+ " x=alt.X('orig_date_earliest:T', title=\"original date (range)\"), # , axis=alt.Axis(format=\"r\")),\n",
+ " x2='orig_date_latest:T',\n",
+ " y=alt.Y('count(pgpid)', title='Count of Documents')\n",
+ ").properties(width=1200, height=150)\n",
+ "\n",
+ "line_chart = alt.Chart(graphable_data).mark_line(opacity=0.6, color=\"green\", interpolate=\"monotone\").encode(\n",
+ " x=alt.X('orig_date_mid:T', title=\"Year (midpoint)\"),\n",
+ " y=alt.Y('count(pgpid)', title='Documents')\n",
+ ").properties(width=1200, height=150)\n",
+ "\n",
+ "(bar_chart & line_chart).properties(title=\"Documents by date (1000-1300)\").interactive()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "951d92ea-4689-481c-8590-324b782a7a1c",
+ "metadata": {},
+ "source": [
+ "## Compare weekdays\n",
+ "\n",
+ "Sometimes the original date includes a day of the week; we don't expect these to be completely reliable, but let's compare the weekdays in the original date with the weekday as determined by the parsed `Undate`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "3122a874-bb17-429f-993f-4bf7a76c1a36",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 1714 | \n",
+ " 2418 | \n",
+ " Monday 20 Tevet 1520 | \n",
+ " Seleucid | \n",
+ " 1208-12-29 | \n",
+ " 1208-12-29 | \n",
+ " 1520-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 29924 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 34008 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 34466 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ " | 34467 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "29924 1214/1215 1214/1215 1526-10-13 day \n",
+ "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ " type \n",
+ "851 Legal document \n",
+ "1714 Legal document \n",
+ "1929 Legal document \n",
+ "2013 Legal document \n",
+ "3257 Legal document \n",
+ "... ... \n",
+ "29303 Legal document \n",
+ "29924 Legal document \n",
+ "34008 Legal document \n",
+ "34466 Legal document \n",
+ "34467 Legal document \n",
+ "\n",
+ "[104 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision', 'type']]\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9c03fd7-731c-44ce-ae2d-0bc1308790d0",
+ "metadata": {},
+ "source": [
+ "Extract the weekday from the original date and determine the undate weekday.\n",
+ "\n",
+ "Both Arabic and Hebrew days begin in the evening, so if the date string includes the text \"night\" we shift the original day by one for comparison."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "3e4ea50c-b11c-433b-b6f9-691098b057d3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 1714 | \n",
+ " 2418 | \n",
+ " Monday 20 Tevet 1520 | \n",
+ " Seleucid | \n",
+ " 1208-12-29 | \n",
+ " 1208-12-29 | \n",
+ " 1520-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 29924 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 34008 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 34466 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 34467 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "29924 1214/1215 1214/1215 1526-10-13 day \n",
+ "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ " type undate_weekday undate_weekday_name orig_weekday \n",
+ "851 Legal document 3 Thursday Thursday \n",
+ "1714 Legal document 0 Monday Monday \n",
+ "1929 Legal document 0 Monday Monday \n",
+ "2013 Legal document 2 Wednesday Wednesday \n",
+ "3257 Legal document 3 Thursday Thursday \n",
+ "... ... ... ... ... \n",
+ "29303 Legal document 0 Monday Monday \n",
+ "29924 Legal document 2 Wednesday Wednesday \n",
+ "34008 Legal document 0 Monday Monday \n",
+ "34466 Legal document 0 Monday Monday \n",
+ "34467 Legal document 4 Friday Friday \n",
+ "\n",
+ "[104 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "days = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n",
+ "\n",
+ "# get numeric weekday; since these dates are all day-precision we can just use the earliest date\n",
+ "weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)\n",
+ "weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])\n",
+ "# extract weekday from date label\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()\n",
+ "# correct misspellings\n",
+ "misspelled_days = {\n",
+ " \"Wedensday\": \"Wednesday\",\n",
+ " \"Thrusday\": \"Thursday\",\n",
+ "}\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))\n",
+ "\n",
+ "# shift night to next day, e.g. Wednesday night should be Thursday\n",
+ "# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day\n",
+ "def next_day(weekday):\n",
+ " return days[(days.index(weekday) +1) % 7]\n",
+ "\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if \" night\" in row.doc_date_original else row.orig_weekday, axis=1)\n",
+ "\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3ab3428-9700-4e57-b3ff-329c737d98f7",
+ "metadata": {},
+ "source": [
+ "Here is the subset of records that specify \"night\":"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "4ced7809-1414-44ae-aae7-c2d0d1dee9ad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 5511 | \n",
+ " 7237 | \n",
+ " Tuesday night, 22 Kislev 1435 | \n",
+ " Seleucid | \n",
+ " 1123-12-12 | \n",
+ " 1123-12-12 | \n",
+ " 1435-09-22 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 5857 | \n",
+ " 7642 | \n",
+ " Thursday night, 23 Tammuz 1538 | \n",
+ " Seleucid | \n",
+ " 1227-07-09 | \n",
+ " 1227-07-09 | \n",
+ " 1538-04-23 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ " | 6419 | \n",
+ " 8332 | \n",
+ " Friday night, 20 Iyar 4957 | \n",
+ " Anno Mundi | \n",
+ " 1197-05 | \n",
+ " 1197-05 | \n",
+ " 4957-02-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Saturday | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "5511 7237 Tuesday night, 22 Kislev 1435 Seleucid \n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid \n",
+ "5857 7642 Thursday night, 23 Tammuz 1538 Seleucid \n",
+ "6419 8332 Friday night, 20 Iyar 4957 Anno Mundi \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "5511 1123-12-12 1123-12-12 1435-09-22 day \n",
+ "5854 1127 1127 1438-10-29 day \n",
+ "5857 1227-07-09 1227-07-09 1538-04-23 day \n",
+ "6419 1197-05 1197-05 4957-02-20 day \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "\n",
+ " type undate_weekday undate_weekday_name orig_weekday \n",
+ "851 Legal document 3 Thursday Thursday \n",
+ "1929 Legal document 0 Monday Monday \n",
+ "3257 Legal document 3 Thursday Thursday \n",
+ "5511 Legal document 2 Wednesday Wednesday \n",
+ "5854 Legal document 4 Friday Tuesday \n",
+ "5857 Legal document 4 Friday Friday \n",
+ "6419 Legal document 5 Saturday Saturday \n",
+ "29303 Legal document 0 Monday Monday "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates[weekday_dates.doc_date_original.str.contains(\" night\")]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "94b8aae8-6bc1-425c-b723-427356cfb647",
+ "metadata": {},
+ "source": [
+ "How many of the original and undate weekdays match?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "fedb5323-0e9c-476e-a7e2-95443d2f9e1d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "44 matches, 60 mismatches (42.31%)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5271 | \n",
+ " 6947 | \n",
+ " Monday 3 Iyyar 1740 | \n",
+ " Seleucid | \n",
+ " 1429-04-07 | \n",
+ " 1429-04-07 | \n",
+ " 1740-02-03 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 8648 | \n",
+ " 11227 | \n",
+ " Monday 24 Jumādā I 517 | \n",
+ " Hijrī | \n",
+ " 1123-07-20 | \n",
+ " 1123-07-20 | \n",
+ " 0517-05-24 | \n",
+ " day | \n",
+ " Paraliterary text | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 16397 | \n",
+ " 19649 | \n",
+ " Thursday 26 Iyyar 5306 | \n",
+ " Anno Mundi | \n",
+ " 1546-04-28 | \n",
+ " 1546-04-28 | \n",
+ " 5306-02-26 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 17723 | \n",
+ " 21094 | \n",
+ " Saturday 20 Rajab 550 | \n",
+ " Hijrī | \n",
+ " 1155-09-19 | \n",
+ " 1155-09-19 | \n",
+ " 0550-07-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Saturday | \n",
+ "
\n",
+ " \n",
+ " | 23099 | \n",
+ " 27479 | \n",
+ " Tuesday 11 Tammuz 5525 | \n",
+ " Anno Mundi | \n",
+ " 1765-06-30 | \n",
+ " 1765-06-30 | \n",
+ " 5525-04-11 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 6 | \n",
+ " Sunday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 23104 | \n",
+ " 27484 | \n",
+ " Friday 20th Shevat 5405 | \n",
+ " Anno Mundi | \n",
+ " 1645 | \n",
+ " 1645 | \n",
+ " 5405-11-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ " | 23105 | \n",
+ " 27485 | \n",
+ " Sunday 22 Adar 5590 | \n",
+ " Anno Mundi | \n",
+ " 1830-03-17 | \n",
+ " 1830-03-17 | \n",
+ " 5590-12-22 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23107 | \n",
+ " 27487 | \n",
+ " Thursday 15 Shevat 5450 | \n",
+ " Anno Mundi | \n",
+ " 1690-01-25 | \n",
+ " 1690-01-25 | \n",
+ " 5450-11-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 23109 | \n",
+ " 27489 | \n",
+ " Sunday 6 Nisan 5528 | \n",
+ " Anno Mundi | \n",
+ " 1768-03-24 | \n",
+ " 1768-03-24 | \n",
+ " 5528-01-06 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23110 | \n",
+ " 27490 | \n",
+ " Thursday 19th Elul 5428 | \n",
+ " Anno Mundi | \n",
+ " 1668 | \n",
+ " 1668 | \n",
+ " 5428-06-19 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 6 | \n",
+ " Sunday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 23111 | \n",
+ " 27491 | \n",
+ " Tuesday 1 Kislev 5507 | \n",
+ " Anno Mundi | \n",
+ " 1746-11-14 | \n",
+ " 1746-11-14 | \n",
+ " 5507-09-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " | 23116 | \n",
+ " 27496 | \n",
+ " Sunday 28 Elul 5511 | \n",
+ " Anno Mundi | \n",
+ " 1751-09-18 | \n",
+ " 1751-09-18 | \n",
+ " 5511-06-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23117 | \n",
+ " 27497 | \n",
+ " Sunday 17th Sivan 5423 | \n",
+ " Anno Mundi | \n",
+ " 1663 | \n",
+ " 1663 | \n",
+ " 5423-03-17 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23118 | \n",
+ " 27498 | \n",
+ " Sunday 25th Tevet 5409 | \n",
+ " Anno Mundi | \n",
+ " 1648 | \n",
+ " 1648 | \n",
+ " 5409-10-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23120 | \n",
+ " 27500 | \n",
+ " Thursday 4 Sivan 5516 | \n",
+ " Anno Mundi | \n",
+ " 1756-06-02 | \n",
+ " 1756-06-02 | \n",
+ " 5516-03-04 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " | 23127 | \n",
+ " 27507 | \n",
+ " Sunday 25 Sivan 5556 | \n",
+ " Anno Mundi | \n",
+ " 1796-07-01 | \n",
+ " 1796-07-01 | \n",
+ " 5556-03-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " | 23131 | \n",
+ " 27511 | \n",
+ " Wednesday 28th Tevet 5399 | \n",
+ " Anno Mundi | \n",
+ " 1640 | \n",
+ " 1640 | \n",
+ " 5399-10-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 1 | \n",
+ " Tuesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " | 23135 | \n",
+ " 27515 | \n",
+ " Monday 15th Iyyar 5414 | \n",
+ " Anno Mundi | \n",
+ " 1654 | \n",
+ " 1654 | \n",
+ " 5414-02-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " | 23136 | \n",
+ " 27516 | \n",
+ " Thursday 24 Nisan 5481 | \n",
+ " Anno Mundi | \n",
+ " 1721-04-21 | \n",
+ " 1721-04-21 | \n",
+ " 5481-01-24 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5271 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
+ "8648 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n",
+ "16397 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n",
+ "17723 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n",
+ "23099 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n",
+ "23104 27484 Friday 20th Shevat 5405 Anno Mundi 1645 \n",
+ "23105 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n",
+ "23107 27487 Thursday 15 Shevat 5450 Anno Mundi 1690-01-25 \n",
+ "23109 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n",
+ "23110 27490 Thursday 19th Elul 5428 Anno Mundi 1668 \n",
+ "23111 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n",
+ "23116 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n",
+ "23117 27497 Sunday 17th Sivan 5423 Anno Mundi 1663 \n",
+ "23118 27498 Sunday 25th Tevet 5409 Anno Mundi 1648 \n",
+ "23120 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n",
+ "23127 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n",
+ "23131 27511 Wednesday 28th Tevet 5399 Anno Mundi 1640 \n",
+ "23135 27515 Monday 15th Iyyar 5414 Anno Mundi 1654 \n",
+ "23136 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n",
+ "\n",
+ " undate_standard undate_orig orig_date_precision type \\\n",
+ "5271 1429-04-07 1740-02-03 day Legal document \n",
+ "5854 1127 1438-10-29 day Legal document \n",
+ "8648 1123-07-20 0517-05-24 day Paraliterary text \n",
+ "16397 1546-04-28 5306-02-26 day Legal document \n",
+ "17723 1155-09-19 0550-07-20 day Legal document \n",
+ "23099 1765-06-30 5525-04-11 day Legal document \n",
+ "23104 1645 5405-11-20 day Legal document \n",
+ "23105 1830-03-17 5590-12-22 day Legal document \n",
+ "23107 1690-01-25 5450-11-15 day Legal document \n",
+ "23109 1768-03-24 5528-01-06 day Legal document \n",
+ "23110 1668 5428-06-19 day Legal document \n",
+ "23111 1746-11-14 5507-09-01 day Legal document \n",
+ "23116 1751-09-18 5511-06-28 day Legal document \n",
+ "23117 1663 5423-03-17 day Legal document \n",
+ "23118 1648 5409-10-25 day Legal document \n",
+ "23120 1756-06-02 5516-03-04 day Legal document \n",
+ "23127 1796-07-01 5556-03-25 day Legal document \n",
+ "23131 1640 5399-10-28 day Legal document \n",
+ "23135 1654 5414-02-15 day Legal document \n",
+ "23136 1721-04-21 5481-01-24 day Legal document \n",
+ "\n",
+ " undate_weekday undate_weekday_name orig_weekday \n",
+ "5271 3 Thursday Monday \n",
+ "5854 4 Friday Tuesday \n",
+ "8648 4 Friday Monday \n",
+ "16397 2 Wednesday Thursday \n",
+ "17723 0 Monday Saturday \n",
+ "23099 6 Sunday Tuesday \n",
+ "23104 3 Thursday Friday \n",
+ "23105 2 Wednesday Sunday \n",
+ "23107 2 Wednesday Thursday \n",
+ "23109 3 Thursday Sunday \n",
+ "23110 6 Sunday Thursday \n",
+ "23111 0 Monday Tuesday \n",
+ "23116 5 Saturday Sunday \n",
+ "23117 4 Friday Sunday \n",
+ "23118 5 Saturday Sunday \n",
+ "23120 2 Wednesday Thursday \n",
+ "23127 4 Friday Sunday \n",
+ "23131 1 Tuesday Wednesday \n",
+ "23135 5 Saturday Monday \n",
+ "23136 0 Monday Thursday "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]\n",
+ "\n",
+ "mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]\n",
+ "\n",
+ "print(f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)\")\n",
+ "mismatches.head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "492352af-76db-47b5-afa2-f5388c4d1d71",
+ "metadata": {},
+ "source": [
+ "Is there any noticeable difference about where the mismatches are coming from based on calendar or day of week?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "d6476907-1628-4d68-ab1f-43c95e123707",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Anno Mundi 55\n",
+ "Seleucid 3\n",
+ "Hijrī 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mismatches.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "18b71d18-5d5b-4f92-8801-499bcf412efe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_weekday\n",
+ "Wednesday 17\n",
+ "Sunday 12\n",
+ "Monday 10\n",
+ "Thursday 9\n",
+ "Tuesday 7\n",
+ "Friday 4\n",
+ "Saturday 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mismatches.orig_weekday.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "eb7ea065-e4b5-47aa-9538-8dc9851ea572",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 mismatches that include text 'night'\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
+ "\n",
+ " undate_standard undate_orig orig_date_precision type \\\n",
+ "5854 1127 1438-10-29 day Legal document \n",
+ "\n",
+ " undate_weekday undate_weekday_name orig_weekday \n",
+ "5854 4 Friday Tuesday "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many mismatches are due to night?\n",
+ "night_mismatches = mismatches[mismatches.doc_date_original.str.contains(\" night\")]\n",
+ "print(f\"{len(night_mismatches)} mismatches that include text 'night'\")\n",
+ "night_mismatches"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16f9a9db-434f-407e-8613-42941b4f3a14",
+ "metadata": {},
+ "source": [
+ "### Plot document frequency by day\n",
+ "\n",
+ "Because we're preserving as much date information as possible, we can plot based on things like weekday - even across different calendars.\n",
+ "\n",
+ "For documents with day-level date precision, how are they distributed by weekday?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "ece780b8-2eb2-4cbc-9195-27def665f7fa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get numeric weekday\n",
+ "orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)\n",
+ "orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
+ "\n",
+ "# restrict to dates with day precision; the rest are just using earliest day\n",
+ "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']\n",
+ "\n",
+ "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Color('count(pgpid)', title='# of documents')\n",
+ ").properties(title='document frequency by weekday')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "6b2f24de-18ce-4f40-b300-e8cc334a338c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "undate_weekday_name\n",
+ "Monday 305\n",
+ "Thursday 282\n",
+ "Tuesday 241\n",
+ "Sunday 229\n",
+ "Wednesday 229\n",
+ "Friday 215\n",
+ "Saturday 98\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_days.undate_weekday_name.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "dea83b43-b379-4807-8a33-8e26d7f4f8e7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_calendar_chart = alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " # alt.Y('doc_date_calendar'),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").facet(row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")).properties(title='document frequency by weekday and calendar')\n",
+ "weekday_calendar_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "484069be-8f75-4197-8f96-4683ab509028",
+ "metadata": {},
+ "source": [
+ "This chart is skewed due to the fact we have so many more day-precision dates from the Hebrew calendar than any other. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "cfecdb64-03b4-405b-b1f3-85e876f55680",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Anno Mundi 82\n",
+ "Seleucid 20\n",
+ "Hijrī 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfdfcf6b-d572-4f9b-8538-eca932f50942",
+ "metadata": {},
+ "source": [
+ "This is more obvious if we use independent color scales."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "e66917b0-2221-42dd-a99b-df847b8e815b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_calendar_chart.resolve_scale(color='independent')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e2a74a1-546b-4069-bff5-29788dee8997",
+ "metadata": {},
+ "source": [
+ "What about weekday by century? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "6a7a0bf5-f8c2-4034-8495-2fb4b297740a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate_standard | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " type | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ " century | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 1714 | \n",
+ " 2418 | \n",
+ " Monday 20 Tevet 1520 | \n",
+ " Seleucid | \n",
+ " 1208-12-29 | \n",
+ " 1208-12-29 | \n",
+ " 1520-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ " | 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ " 1000s | \n",
+ "
\n",
+ " \n",
+ " | 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 29303 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 29924 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " | 34008 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1000s | \n",
+ "
\n",
+ " \n",
+ " | 34466 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ " | 34467 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ " Legal document | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 12 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "29924 1214/1215 1214/1215 1526-10-13 day \n",
+ "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ " type undate_weekday undate_weekday_name orig_weekday century \n",
+ "851 Legal document 3 Thursday Thursday 1200s \n",
+ "1714 Legal document 0 Monday Monday 1200s \n",
+ "1929 Legal document 0 Monday Monday 1100s \n",
+ "2013 Legal document 2 Wednesday Wednesday 1000s \n",
+ "3257 Legal document 3 Thursday Thursday 1200s \n",
+ "... ... ... ... ... ... \n",
+ "29303 Legal document 0 Monday Monday 1200s \n",
+ "29924 Legal document 2 Wednesday Wednesday 1200s \n",
+ "34008 Legal document 0 Monday Monday 1000s \n",
+ "34466 Legal document 0 Monday Monday 1100s \n",
+ "34467 Legal document 4 Friday Friday 1100s \n",
+ "\n",
+ "[104 rows x 12 columns]"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get rough century (gregorian calendar)\n",
+ "weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: (\"%04d\" % x.earliest.year)[:2] + \"00s\")\n",
+ "\n",
+ "weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'century']].head()\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "eb99871e-d9a5-4211-9bd2-5a9acfe8face",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Y('century'),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").properties(title='document frequency by weekday and century')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfd1e93b-1286-43d9-be44-34ba607435e1",
+ "metadata": {},
+ "source": [
+ "The weekday + century heatmap suggests we're more likely to have day-level precision dates from the 1700s than any other time period in the dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ec7d437-092f-47de-b60c-a1b72f45b4dd",
+ "metadata": {},
+ "source": [
+ "## Plot frequency by month and calendar"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "08a58fcf-2b08-441b-9dc8-385bafeb88e6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what about heat map by month?\n",
+ "\n",
+ "# get numeric month\n",
+ "orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)\n",
+ "# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
+ "\n",
+ "has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]\n",
+ "\n",
+ "alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_month', title='month'),\n",
+ " alt.Color('count(pgpid)', title='# of documents')\n",
+ ").facet(\n",
+ " row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ ").properties(title='Document frequency by month and calendar')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ad489d5-483d-4280-a7d8-0090fdd2aa32",
+ "metadata": {},
+ "source": [
+ "That very light month 13 in the Hebrew and Seleucid calendars reflects the fact that the Hebrew calendar has a leap _month_."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "a7a16c53-6f01-4457-9458-4fcf80a35c51",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Seleucid 1196\n",
+ "Anno Mundi 903\n",
+ "Hijrī 516\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "has_month.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "65bce74e-67b7-48df-9f7f-a6f264af4f11",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1593, 38)"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_days[orig_dates_days.undate_weekday_name.notna()].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "ac940883-e00e-4dde-8339-95a1b733f6f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_69693/2787254306.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# weekday frequency by month?\n",
+ "\n",
+ "orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n",
+ "\n",
+ "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Y('undate_month', title=\"month\"),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").facet(\n",
+ " column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ ").properties(title='Document frequency by weekday and month (1,557 documents)')\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/pyproject.toml
index 2dc6515..fcebbbd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,12 @@ readme = "README.md"
license = { text = "Apache-2" }
requires-python = ">= 3.10"
dynamic = ["version"]
-dependencies = ["lark[interegular]", "numpy", "convertdate", "strenum; python_version < '3.11'"]
+dependencies = [
+ "lark[interegular]",
+ "numpy",
+ "convertdate",
+ "strenum; python_version < '3.11'",
+]
authors = [
{ name = "Rebecca Sutton Koeser" },
{ name = "Cole Crawford" },
@@ -29,7 +34,7 @@ keywords = [
"digital-humanities",
]
classifiers = [
- "Development Status :: 3 - Alpha",
+ "Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
@@ -49,7 +54,7 @@ classifiers = [
[project.optional-dependencies]
docs = ["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"]
test = ["pytest>=7.2", "pytest-ordering", "pytest-cov"]
-notebooks = ["jupyterlab", "pandas", "treon"]
+notebooks = ["jupyterlab", "pandas", "treon", "altair"]
check = ["undate[docs]", "undate[notebooks]", "mypy", "ruff"]
dev = [
"pre-commit>=2.20.0",
diff --git a/src/undate/__init__.py b/src/undate/__init__.py
index 0976d0e..de0afb1 100644
--- a/src/undate/__init__.py
+++ b/src/undate/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.4.0"
+__version__ = "0.5.0"
from undate.date import DatePrecision
from undate.undate import Undate, Calendar
diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py
index 04db129..1cf1b6d 100644
--- a/src/undate/converters/base.py
+++ b/src/undate/converters/base.py
@@ -48,6 +48,8 @@
from functools import cache
from typing import Dict, Type
+from undate.date import Date
+
logger = logging.getLogger(__name__)
@@ -58,6 +60,10 @@ class BaseDateConverter:
#: Converter name. Subclasses must define a unique name.
name: str = "Base Converter"
+ # provisional...
+ LEAP_YEAR = 0
+ NON_LEAP_YEAR = 0
+
def parse(self, value: str):
"""
Parse a string and return an :class:`~undate.undate.Undate` or
@@ -142,6 +148,16 @@ class BaseCalendarConverter(BaseDateConverter):
#: Converter name. Subclasses must define a unique name.
name: str = "Base Calendar Converter"
+ #: arbitrary known non-leap year
+ NON_LEAP_YEAR: int
+ #: arbitrary known leap year
+ LEAP_YEAR: int
+
+ # minimum year for this calendar, if there is one
+ MIN_YEAR: None | int = None
+ # maximum year for this calendar, if there is one
+ MAX_YEAR: None | int = None
+
def min_month(self) -> int:
"""Smallest numeric month for this calendar."""
raise NotImplementedError
@@ -162,6 +178,27 @@ def max_day(self, year: int, month: int) -> int:
"""maximum numeric day for the specified year and month in this calendar"""
raise NotImplementedError
+ def days_in_year(self, year: int) -> int:
+ """Number of days in the specified year in this calendar. The default implementation
+ uses min and max month and max day methods along with Gregorian conversion method
+ to calculate the number of days in the specified year.
+ """
+ year_start = Date(*self.to_gregorian(year, self.min_month(), 1))
+ last_month = self.max_month(year)
+ year_end = Date(
+ *self.to_gregorian(year, last_month, self.max_day(year, last_month))
+ )
+ # add 1 because the difference doesn't include the end point
+ return (year_end - year_start).days + 1
+
+ def representative_years(self, years: None | list[int] = None) -> list[int]:
+ """Returns a list of representative years within the specified list.
+ Result should include one for each type of variant year for this
+ calendar (e.g., leap year and non-leap year). If no years are specified,
+ returns a list of representative years for the current calendar.
+ """
+ raise NotImplementedError
+
def to_gregorian(self, year, month, day) -> tuple[int, int, int]:
"""Convert a date for this calendar specified by numeric year, month, and day,
into the Gregorian equivalent date. Should return a tuple of year, month, day.
diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py
index a43a270..5836b2f 100644
--- a/src/undate/converters/calendars/__init__.py
+++ b/src/undate/converters/calendars/__init__.py
@@ -1,5 +1,11 @@
from undate.converters.calendars.gregorian import GregorianDateConverter
from undate.converters.calendars.hebrew import HebrewDateConverter
from undate.converters.calendars.islamic import IslamicDateConverter
+from undate.converters.calendars.seleucid import SeleucidDateConverter
-__all__ = ["GregorianDateConverter", "HebrewDateConverter", "IslamicDateConverter"]
+__all__ = [
+ "GregorianDateConverter",
+ "HebrewDateConverter",
+ "IslamicDateConverter",
+ "SeleucidDateConverter",
+]
diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py
index 5a1d2dc..b3b103b 100644
--- a/src/undate/converters/calendars/gregorian.py
+++ b/src/undate/converters/calendars/gregorian.py
@@ -1,4 +1,4 @@
-from calendar import monthrange
+from calendar import monthrange, isleap
from undate.converters.base import BaseCalendarConverter
@@ -13,8 +13,10 @@ class GregorianDateConverter(BaseCalendarConverter):
#: calendar
calendar_name: str = "Gregorian"
- #: known non-leap year
+ #: arbitrary known non-leap year
NON_LEAP_YEAR: int = 2022
+ #: arbitrary known leap year
+ LEAP_YEAR: int = 2024
def min_month(self) -> int:
"""First month for the Gregorian calendar."""
@@ -38,10 +40,38 @@ def max_day(self, year: int, month: int) -> int:
_, max_day = monthrange(year, month)
else:
# if year and month are unknown, return maximum possible
+ # TODO: should this return an IntervalRange?
max_day = 31
return max_day
+ def representative_years(self, years: None | list[int] = None) -> list[int]:
+ """Takes a list of years and returns a subset with one leap year and one non-leap year.
+ If no years are specified, returns a known leap year and non-leap year.
+ """
+
+ # if years is unset or list is empty
+ if not years:
+ return [self.LEAP_YEAR, self.NON_LEAP_YEAR]
+
+ found_leap = False
+ found_non_leap = False
+ rep_years = []
+ for year in years:
+ if isleap(year):
+ if not found_leap:
+ found_leap = True
+ rep_years.append(year)
+ else:
+ if not found_non_leap:
+ found_non_leap = True
+ rep_years.append(year)
+ # stop as soon as we've found one example of each type of year
+ if found_leap and found_non_leap:
+ break
+
+ return rep_years
+
def to_gregorian(self, year, month, day) -> tuple[int, int, int]:
"""Convert to Gregorian date. This returns the specified by year, month,
and day unchanged, but is provided for consistency since all calendar
diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py
index d540021..165d67e 100644
--- a/src/undate/converters/calendars/hebrew/converter.py
+++ b/src/undate/converters/calendars/hebrew/converter.py
@@ -21,6 +21,11 @@ class HebrewDateConverter(BaseCalendarConverter):
name: str = "Hebrew"
calendar_name: str = "Anno Mundi"
+ #: arbitrary known non-leap year; 4816 is a non-leap year with 353 days (minimum possible)
+ NON_LEAP_YEAR: int = 4816
+ #: arbitrary known leap year; 4837 is a leap year with 385 days (maximum possible)
+ LEAP_YEAR: int = 4837
+
def __init__(self):
self.transformer = HebrewDateTransformer()
@@ -47,6 +52,36 @@ def max_day(self, year: int, month: int) -> int:
# NOTE: unreleased v2.4.1 of convertdate standardizes month_days to month_length
return hebrew.month_days(year, month)
+ def days_in_year(self, year: int) -> int:
+ """the number of days in the specified year for this calendar"""
+ return int(hebrew.year_days(year))
+
+ def representative_years(self, years: None | list[int] = None) -> list[int]:
+ """Takes a list of years and returns a subset with all possible variations in number of days.
+ If no years are specified, returns a known leap year and non-leap year.
+ """
+
+ year_lengths = set()
+ max_year_lengths = 6 # there are 6 different possible length years
+
+ # if years is unset or list is empty
+ if not years:
+ # NOTE: this does not cover all possible lengths, but should cover min/max
+ return [self.LEAP_YEAR, self.NON_LEAP_YEAR]
+
+ rep_years = []
+ for year in years:
+ days = self.days_in_year(year)
+ if days not in year_lengths:
+ year_lengths.add(days)
+ rep_years.append(year)
+
+ # stop if we find one example of each type of year
+ if len(year_lengths) == max_year_lengths:
+ break
+
+ return rep_years
+
def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]:
"""Convert a Hebrew date, specified by year, month, and day,
to the Gregorian equivalent date. Returns a tuple of year, month, day.
diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark
index b55ec3f..6f4244c 100644
--- a/src/undate/converters/calendars/hebrew/hebrew.lark
+++ b/src/undate/converters/calendars/hebrew/hebrew.lark
@@ -3,7 +3,7 @@
// only support day month year format for now
// parser requires numeric day and year to be distinguished based on order
-hebrew_date: day month year | month year | year
+hebrew_date: weekday? day month comma? year | month year | year
// TODO: handle date ranges?
@@ -27,10 +27,14 @@ month: month_1
| month_10
| month_11
| month_12
- | month_13
+ | month_13
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/
+comma: ","
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma?
+
+
// months, in order; from convertdate list
// with variants from Princeton Geniza Project
// support matching with and without accents
@@ -43,11 +47,13 @@ month_5: "Av"
month_6: "Elul"
// Tishrei or Tishri
month_7: /Tishre?i/
-month_8: "Heshvan"
+// Heshvan, Ḥeshvan, Marḥeshvan
+month_8: /(Mar)?[ḤHḥ]eshvan/
month_9: "Kislev"
// Tevet or Teveth
month_10: /[ṬT]eveth?/
-month_11: "Shevat"
+// Shevat or Shevaṭ
+month_11: /Sheva[tṭ]/
// Adar I or Adar
month_12: /Adar( I)?/
// Adar II or Adar Bet
diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py
index 48e8b20..8880434 100644
--- a/src/undate/converters/calendars/hebrew/transformer.py
+++ b/src/undate/converters/calendars/hebrew/transformer.py
@@ -13,6 +13,8 @@ class HebrewDateTransformer(Transformer):
"""Transform a Hebrew date parse tree and return an Undate or
UndateInterval."""
+ calendar = Calendar.HEBREW
+
def hebrew_date(self, items):
parts = {}
for child in items:
@@ -22,9 +24,9 @@ def hebrew_date(self, items):
value = int(child.children[0])
parts[str(child.data)] = value
- # initialize and return an undate with islamic year, month, day and
- # islamic calendar
- return HebrewUndate(**parts)
+ # initialize and return an undate with year, month, day and
+ # configured calendar (hebrew by default)
+ return Undate(**parts, calendar=self.calendar)
# year translation is not needed since we want a tree with name year
# this is equivalent to a no-op
diff --git a/src/undate/converters/calendars/islamic/converter.py b/src/undate/converters/calendars/islamic/converter.py
index c658c90..67f2a64 100644
--- a/src/undate/converters/calendars/islamic/converter.py
+++ b/src/undate/converters/calendars/islamic/converter.py
@@ -21,6 +21,16 @@ class IslamicDateConverter(BaseCalendarConverter):
name: str = "Islamic"
calendar_name: str = "Islamic"
+ #: arbitrary known non-leap year
+ NON_LEAP_YEAR: int = 1457
+ #: arbitrary known leap year
+ LEAP_YEAR: int = 1458
+
+ # minimum year for islamic calendar is 1 AH, does not go negative
+ MIN_YEAR: None | int = 1
+ # convertdate gives a month 34 for numpy max year 2.5e16, so scale it back a bit
+ MAX_YEAR = int(2.5e12)
+
def __init__(self):
self.transformer = IslamicDateTransformer()
@@ -36,10 +46,37 @@ def max_month(self, year: int) -> int:
"""maximum numeric month for this calendar"""
return 12
+ def representative_years(self, years: None | list[int] = None) -> list[int]:
+ """Takes a list of years and returns a subset with one leap year and one non-leap year.
+ If no years are specified, returns a known leap year and non-leap year.
+ """
+
+ # if years is unset or list is empty
+ if not years:
+ return [self.LEAP_YEAR, self.NON_LEAP_YEAR]
+ found_leap = False
+ found_non_leap = False
+ rep_years = []
+ for year in years:
+ if islamic.leap(year):
+ if not found_leap:
+ found_leap = True
+ rep_years.append(year)
+ else:
+ if not found_non_leap:
+ found_non_leap = True
+ rep_years.append(year)
+ # stop as soon as we've found one example of each type of year
+ if found_leap and found_non_leap:
+ break
+
+ return rep_years
+
def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]:
"""Convert a Hijri date, specified by year, month, and day,
to the Gregorian equivalent date. Returns a tuple of year, month, day.
"""
+ # NOTE: this results in weird numbers for months when year gets sufficiently high
return islamic.to_gregorian(year, month, day)
def parse(self, value: str) -> Union[Undate, UndateInterval]:
diff --git a/src/undate/converters/calendars/islamic/islamic.lark b/src/undate/converters/calendars/islamic/islamic.lark
index 3ad59a5..1e4940b 100644
--- a/src/undate/converters/calendars/islamic/islamic.lark
+++ b/src/undate/converters/calendars/islamic/islamic.lark
@@ -3,7 +3,7 @@
// only support day month year format for now
// parser requires numeric day and year to be distinguished based on order
-islamic_date: day month year | month year | year
+islamic_date: weekday? day month year | month year | year
// TODO: handle date ranges?
@@ -13,6 +13,7 @@ islamic_date: day month year | month year | year
year: /\d+/
+
// months
month: month_1
| month_2
@@ -29,6 +30,10 @@ month: month_1
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/
+
+comma: ","
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma?
+
// months, in order; from convertdate list
// with variants from Princeton Geniza Project
// support matching with and without accents
@@ -42,7 +47,7 @@ month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|II)/
// Jumādā al-ʾAwwal or Jumādā I
month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/
// Jumādā ath-Thāniya or Jumādā II
-month_6: /Jum[āa]d[āa] (ath-Th[āa]niyah|II)/
+month_6: /Jum[āa][dḍ][āa] (ath-Th[āa]niyah|II)/
month_7: "Rajab"
// Shaʿbān
month_8: /Sha[ʿ']b[āa]n/
diff --git a/src/undate/converters/calendars/seleucid.py b/src/undate/converters/calendars/seleucid.py
new file mode 100644
index 0000000..ae54965
--- /dev/null
+++ b/src/undate/converters/calendars/seleucid.py
@@ -0,0 +1,28 @@
+from undate.converters.calendars import HebrewDateConverter
+from undate.undate import Calendar
+
+
+class SeleucidDateConverter(HebrewDateConverter):
+ #: offset for Seleucid calendar: Seleucid year + 3449 = Anno Mundi year
+ SELEUCID_OFFSET = 3449
+
+ #: converter name: Seleucid
+ name: str = "Seleucid"
+ calendar_name: str = "Seleucid"
+
+ def __init__(self):
+ super().__init__()
+ # override hebrew calendar to initialize undates with seleucid
+ # calendar; this triggers Seleucid calendar to_gregorian method use
+ self.transformer.calendar = Calendar.SELEUCID
+
+ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]:
+ """Convert a Seleucid date, specified by year, month, and day,
+ to the Gregorian equivalent date. Uses hebrew calendar conversion
+ logic with :attr:`SELEUCID_OFFSET`. Returns a tuple of year, month, day.
+ """
+ return super().to_gregorian(year + self.SELEUCID_OFFSET, month, day)
+
+ def days_in_year(self, year: int) -> int:
+ """the number of days in the specified year for this calendar"""
+ return super().days_in_year(year + self.SELEUCID_OFFSET)
diff --git a/src/undate/date.py b/src/undate/date.py
index 27f6efa..4e9eddc 100644
--- a/src/undate/date.py
+++ b/src/undate/date.py
@@ -1,7 +1,9 @@
from enum import IntEnum
+from dataclasses import dataclass, replace
+import operator
# Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None
-from typing import Optional, Union
+from typing import Optional, Union, Iterable
import numpy as np
@@ -29,6 +31,144 @@ def days(self) -> int:
return int(self.astype("datetime64[D]").astype("int"))
+@dataclass
+class UnInt:
+ """An uncertain integer intended for use with uncertain durations (:class:`UnDelta`),
+ to convey a range of possible integer values between an upper
+ and lower bound (both inclusive). Supports comparison, addition and subtraction,
+ checking if a value is included in the range, and iterating over numbers
+ included in the range.
+ """
+
+ lower: int
+ upper: int
+
+ def __post_init__(self):
+ # validate that lower value is less than upper
+ if not self.lower < self.upper:
+ raise ValueError(
+ f"Lower value ({self.lower}) must be less than upper ({self.upper})"
+ )
+
+ def __iter__(self) -> Iterable:
+ # yield all integers in range from lower to upper, inclusive
+ yield from range(self.lower, self.upper + 1)
+
+ def __gt__(self, other: object) -> bool:
+ match other:
+ case int():
+ return self.lower > other
+ case UnInt():
+ return self.lower > other.upper
+ case _:
+ return NotImplemented
+
+ def __lt__(self, other: object) -> bool:
+ match other:
+ case int():
+ return self.upper < other
+ case UnInt():
+ return self.upper < other.lower
+ case _:
+ return NotImplemented
+
+ def __contains__(self, other: object) -> bool:
+ match other:
+ case int():
+ return other >= self.lower and other <= self.upper
+ case UnInt():
+ return other.lower >= self.lower and other.upper <= self.upper
+ case _:
+ # unsupported type: return false
+ return False
+
+ def _replace_with(self, other_lower, other_upper, op):
+ """Create and return a new instance of UnInt using the specified
+ operator (e.g. add, subtract) and other values to modify the values in
+ the current UnInt instance."""
+ return replace(
+ self, lower=op(self.lower, other_lower), upper=op(self.upper, other_upper)
+ )
+
+ def __add__(self, other: object) -> "UnInt":
+ match other:
+ case int():
+ # increase both values by the added amount
+ add_values = (other, other)
+ case UnInt():
+ # add other lower value to current lower and other upper
+ # to current upper to include the largest range of possible values
+ # (when calculating with uncertain values, the uncertainty increases)
+ add_values = (other.lower, other.upper)
+ case _:
+ return NotImplemented
+
+ return self._replace_with(*add_values, operator.add)
+
+ def __sub__(self, other) -> "UnInt":
+ match other:
+ case int():
+ # decrease both values by the subtracted amount
+ sub_values = (other, other)
+ case UnInt():
+ # to determine the largest range of possible values,
+ # subtract the other upper value from current lower
+ # and other lower value from current upper
+ sub_values = (other.upper, other.lower)
+ case _:
+ return NotImplemented
+
+ return self._replace_with(*sub_values, operator.sub)
+
+
+@dataclass
+class UnDelta:
+ """
+ An uncertain timedelta, for durations where the number of days is uncertain.
+ Initialize with a list of possible durations in days as integers, which are used
+ to calculate a value for duration in :attr:`days` as an
+ instance of :class:`UnInt`.
+ """
+
+ # NOTE: we will probably need other timedelta-like logic here besides days...
+
+ #: possible durations in days, as an instance of :class:`UnInt`
+ days: UnInt
+
+ def __init__(self, *days: int):
+ if len(days) < 2:
+ raise ValueError(
+ "Must specify at least two values for an uncertain duration"
+ )
+ self.days = UnInt(min(days), max(days))
+
+ def __repr__(self):
+ # customize string representation for simpler notation; default
+ # specifies full UnInt initialization with upper and lower keywords
+ return f"{self.__class__.__name__}(days=[{self.days.lower},{self.days.upper}])"
+
+ def __eq__(self, other: object) -> bool:
+ # is an uncertain duration ever *equal* another, even if the values are the same?
+ # for now, make the assumption that we only want identity equality
+ # and not value equality; perhaps in future we can revisit
+ # or add functions to check value equality / equivalence / similarity
+ return other is self
+
+ def __lt__(self, other: object) -> bool:
+ match other:
+ case Timedelta() | UnDelta():
+ return self.days < other.days
+ case _:
+ return NotImplemented
+
+ def __gt__(self, other: object) -> bool:
+ match other:
+ case Timedelta() | UnDelta():
+ return self.days > other.days
+ case _:
+ return NotImplemented
+
+
#: timedelta for single day
ONE_DAY = Timedelta(1) # ~ equivalent to datetime.timedelta(days=1)
#: timedelta for a single year (non-leap year)
@@ -104,6 +244,27 @@ def day(self) -> Optional[int]:
return int(str(self.astype("datetime64[D]")).split("-")[-1])
return None
+ @property
+ def weekday(self) -> Optional[int]:
+ """Equivalent to :meth:`datetime.date.weekday`; returns day of week as an
+ integer where Monday is 0 and Sunday is 6. Only supported for dates
+ with date unit in days.
+ """
+ # only return a weekday if date unit is in days
+ if self.dtype == "datetime64[D]":
+ # calculate based on difference between current day and week start
+ # numpy datetime weeks start on thursdays - presumably since
+ # unix epoch day zero was a thursday...
+
+ # implementation inspired in part by https://stackoverflow.com/a/54264187
+
+ thursday_week = self.astype("datetime64[W]")
+ days_from_thursday = (self - thursday_week).astype(int)
+ # if monday is 0, thursday is 3
+ return (days_from_thursday + 3) % 7
+
+ return None
+
def __sub__(self, other):
# modify to conditionally return a timedelta object instead of a
# Date object with dtype timedelta64[D] (default behavior)
diff --git a/src/undate/undate.py b/src/undate/undate.py
index be4454a..e6561bf 100644
--- a/src/undate/undate.py
+++ b/src/undate/undate.py
@@ -19,8 +19,8 @@
# Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None
from typing import Dict, Optional, Union
-from undate.converters.base import BaseDateConverter
-from undate.date import ONE_DAY, ONE_MONTH_MAX, Date, DatePrecision, Timedelta
+from undate.converters.base import BaseCalendarConverter, BaseDateConverter
+from undate.date import ONE_DAY, Date, DatePrecision, Timedelta, UnDelta
class Calendar(StrEnum):
@@ -29,12 +29,22 @@ class Calendar(StrEnum):
GREGORIAN = auto()
HEBREW = auto()
ISLAMIC = auto()
+ SELEUCID = auto()
@staticmethod
- def get_converter(calendar):
+ def get_converter(calendar) -> BaseCalendarConverter:
# calendar converter must be available with a name matching
# the title-case name of the calendar enum entry
- converter_cls = BaseDateConverter.available_converters()[calendar.value.title()]
+ try:
+ converter_cls = BaseDateConverter.available_converters()[
+ calendar.value.title()
+ ]
+ except KeyError as err:
+ raise ValueError(f"Unknown calendar '{calendar}'") from err
+ if not issubclass(converter_cls, BaseCalendarConverter):
+ raise ValueError(
+ f"Requested converter '{calendar.value.title()}' is not a CalendarConverter"
+ )
return converter_cls()
@@ -96,7 +106,6 @@ def __init__(
if calendar is not None:
self.set_calendar(calendar)
self.calendar_converter = Calendar.get_converter(self.calendar)
-
self.calculate_earliest_latest(year, month, day)
if converter is None:
@@ -124,10 +133,11 @@ def calculate_earliest_latest(self, year, month, day):
min_year = int(str(year).replace(self.MISSING_DIGIT, "0"))
max_year = int(str(year).replace(self.MISSING_DIGIT, "9"))
else:
- # use the configured min/max allowable years if we
- # don't have any other bounds
- min_year = self.MIN_ALLOWABLE_YEAR
- max_year = self.MAX_ALLOWABLE_YEAR
+ # if we don't have any other bounds,
+ # use calendar-specific min year if there is one, otherwise use
+ # the configured min/max allowable years
+ min_year = self.calendar_converter.MIN_YEAR or self.MIN_ALLOWABLE_YEAR
+ max_year = self.calendar_converter.MAX_YEAR or self.MAX_ALLOWABLE_YEAR
# if month is passed in as a string but completely unknown,
# treat as unknown/none (date precision already set in init)
@@ -166,7 +176,7 @@ def calculate_earliest_latest(self, year, month, day):
else:
# if we have no day or partial day, calculate min / max
min_day = 1 # is min day ever anything other than 1 ?
- rel_year = year if year and isinstance(year, int) else None
+ rel_year = year if year and isinstance(year, int) else max_year
# use month if it is an integer; otherwise use previusly determined
# max month (which may not be 12 depending if partially unknown)
rel_month = month if month and isinstance(month, int) else latest_month
@@ -192,9 +202,12 @@ def calculate_earliest_latest(self, year, month, day):
)
def set_calendar(self, calendar: Union[str, Calendar]):
+ """Find calendar by name if passed as string and set on the object.
+ Only intended for use at initialization time; use :meth:`as_calendar`
+ to change calendar."""
if calendar is not None:
# if not passed as a Calendar instance, do a lookup
- if not isinstance(calendar, Calendar):
+ if isinstance(calendar, str):
# look for calendar by upper-case name
try:
calendar = Calendar[calendar.upper()]
@@ -202,6 +215,19 @@ def set_calendar(self, calendar: Union[str, Calendar]):
raise ValueError(f"Calendar `{calendar}` is not supported") from err
self.calendar = calendar
+ def as_calendar(self, calendar: Union[str, Calendar]):
+ """Return a new :class:`Undate` object with the same year, month, day, and labels
+ used to initialize the current object, but with a different calendar. Note that this
+ does NOT do calendar conversion, but reinterprets current numeric year, month, day values
+ according to the new calendar."""
+ return Undate(
+ year=self.initial_values.get("year"),
+ month=self.initial_values.get("month"),
+ day=self.initial_values.get("day"),
+ label=self.label,
+ calendar=calendar,
+ )
+
def __str__(self) -> str:
# if any portion of the date is partially known, construct
# pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits
@@ -242,7 +268,7 @@ def format(self, format) -> str:
"""format this undate as a string using the specified format;
for now, only supports named converters"""
converter_cls = BaseDateConverter.available_converters().get(format, None)
- if converter_cls:
+ if converter_cls is not None:
# NOTE: some parsers may return intervals; is that ok here?
return converter_cls().to_string(self)
@@ -319,8 +345,12 @@ def __lt__(self, other: object) -> bool:
# (e.g., single date within the same year)
# comparison for those cases is not currently supported
elif other in self or self in other:
+ # sort by precision, most precise first
+ by_precision = sorted(
+ [self, other], key=lambda x: x.precision, reverse=True
+ )
raise NotImplementedError(
- "Can't compare when one date falls within the other"
+ f"Can't compare when one date ({by_precision[0]}) falls within the other ({by_precision[1]})"
)
# NOTE: unsupported comparisons are supposed to return NotImplemented
# However, doing that in this case results in a confusing TypeError!
@@ -397,7 +427,9 @@ def is_known(self, part: str) -> bool:
return isinstance(self.initial_values[part], int)
def is_partially_known(self, part: str) -> bool:
+ # TODO: should XX / XXXX really be considered partially known? other code seems to assume this, so we'll preserve the behavior
return isinstance(self.initial_values[part], str)
+ # and self.initial_values[part].replace(self.MISSING_DIGIT, "") != ""
@property
def year(self) -> Optional[str]:
@@ -405,7 +437,7 @@ def year(self) -> Optional[str]:
year = self._get_date_part("year")
if year:
return f"{year:0>4}"
- # if value is unset but date precision is month or greater, return unknown month
+ # if value is unset but date precision is year or greater, return unknown year
elif self.precision >= DatePrecision.YEAR:
return self.MISSING_DIGIT * 4
return None
@@ -439,43 +471,99 @@ def _get_date_part(self, part: str) -> Optional[str]:
value = self.initial_values.get(part)
return str(value) if value else None
- def duration(self) -> Timedelta:
+ @property
+ def possible_years(self) -> list[int] | range:
+ """A list or range of possible years for this date in the original calendar.
+ Returns a list with a single year for dates with fully-known years."""
+ if self.known_year:
+ return [self.earliest.year]
+
+ step = 1
+ if (
+ self.is_partially_known("year")
+ and str(self.year).replace(self.MISSING_DIGIT, "") != ""
+ ):
+ # determine the smallest step size for the missing digit
+ earliest_year = int(str(self.year).replace(self.MISSING_DIGIT, "0"))
+ latest_year = int(str(self.year).replace(self.MISSING_DIGIT, "9"))
+ missing_digit_place = len(str(self.year)) - str(self.year).rfind(
+ self.MISSING_DIGIT
+ )
+ # convert place to 1, 10, 100, 1000, etc.
+ step = 10 ** (missing_digit_place - 1)
+ return range(earliest_year, latest_year + 1, step)
+
+ # otherwise, year is fully unknown
+ # returning range from min year to max year is not useful in any scenario!
+ raise ValueError(
+ "Possible years cannot be returned for completely unknown year"
+ )
+
+ @property
+ def representative_years(self) -> list[int]:
+ """A list of representative years for this date."""
+ try:
+ # todo: filter by calendar to minimum needed
+ try:
+ return self.calendar_converter.representative_years(
+ list(self.possible_years)
+ )
+ except NotImplementedError:
+ # if calendar converter does not support representative years, return all years
+ return list(self.possible_years)
+ except ValueError:
+ return [
+ self.calendar_converter.LEAP_YEAR,
+ self.calendar_converter.NON_LEAP_YEAR,
+ ]
+
+ def duration(self) -> Timedelta | UnDelta:
"""What is the duration of this date?
Calculate based on earliest and latest date within range,
taking into account the precision of the date even if not all
parts of the date are known. Note that durations are inclusive
(i.e., a closed interval) and include both the earliest and latest
- date rather than the difference between them."""
+ date rather than the difference between them. Returns a :class:`undate.date.Timedelta` when
+ possible, and an :class:`undate.date.UnDelta` when the duration is uncertain."""
# if precision is a single day, duration is one day
# no matter when it is or what else is known
if self.precision == DatePrecision.DAY:
return ONE_DAY
+ possible_max_days = set()
+
# if precision is month and year is unknown,
# calculate month duration within a single year (not min/max)
if self.precision == DatePrecision.MONTH:
- latest = self.latest
- if not self.known_year:
- # if year is unknown, calculate month duration in
- # a single year
- latest = Date(self.earliest.year, self.latest.month, self.latest.day)
-
- # latest = datetime.date(
- # self.earliest.year, self.latest.month, self.latest.day
- # )
- delta = latest - self.earliest + ONE_DAY
- # month duration can't ever be more than 31 days
- # (could we ever know if it's smaller?)
-
- # if granularity == month but not known month, duration = 31
- if delta.astype(int) > 31:
- return ONE_MONTH_MAX
- return delta
-
- # otherwise, calculate based on earliest/latest range
-
- # subtract earliest from latest and add a day to count start day
+ # for every possible month and year, get max days for that month,
+ # appease mypy, which says month values could be None here;
+ # Date object allows optional month, but earliest/latest initialization
+ # should always be day-precision dates
+ if self.earliest.month is not None and self.latest.month is not None:
+ for possible_month in range(self.earliest.month, self.latest.month + 1):
+ for year in self.representative_years:
+ possible_max_days.add(
+ self.calendar_converter.max_day(year, possible_month)
+ )
+
+ # if precision is year but year is unknown, return an uncertain delta
+ elif self.precision == DatePrecision.YEAR:
+ # this is currently hebrew-specific due to the way the start/end of year wraps for that calendar
+ # with contextlib.suppress(NotImplementedError):
+ possible_max_days = {
+ self.calendar_converter.days_in_year(y)
+ for y in self.representative_years
+ }
+
+ # if there is more than one possible value for number of days
+ # due to range including leap year / non-leap year, return an uncertain delta
+ if possible_max_days:
+ if len(possible_max_days) > 1:
+ return UnDelta(*possible_max_days)
+ return Timedelta(possible_max_days.pop())
+
+ # otherwise, subtract earliest from latest and add a day to include start day in the count
return self.latest - self.earliest + ONE_DAY
def _missing_digit_minmax(
diff --git a/tests/test_converters/test_base.py b/tests/test_converters/test_base.py
index a4ac52d..6265c15 100644
--- a/tests/test_converters/test_base.py
+++ b/tests/test_converters/test_base.py
@@ -91,3 +91,5 @@ def test_not_implemented(self):
BaseCalendarConverter().max_day(1900, 12)
with pytest.raises(NotImplementedError):
BaseCalendarConverter().to_gregorian(1900, 12, 31)
+ with pytest.raises(NotImplementedError):
+ BaseCalendarConverter().representative_years([1900, 1901])
diff --git a/tests/test_converters/test_calendars/test_gregorian.py b/tests/test_converters/test_calendars/test_gregorian.py
new file mode 100644
index 0000000..e0bf5ef
--- /dev/null
+++ b/tests/test_converters/test_calendars/test_gregorian.py
@@ -0,0 +1,40 @@
+from undate.converters.calendars import GregorianDateConverter
+
+
+class TestGregorianDateConverter:
+ def test_to_gregorian(self):
+ converter = GregorianDateConverter()
+ # conversion is a no-op, returns values unchanged
+ assert converter.to_gregorian(2025, 6, 15) == (2025, 6, 15)
+
+ def test_min_month(self):
+ assert GregorianDateConverter().min_month() == 1
+
+ def test_max_month(self):
+ assert GregorianDateConverter().max_month(2025) == 12
+
+ def test_max_day(self):
+ converter = GregorianDateConverter()
+ assert converter.max_day(2025, 1) == 31
+ assert converter.max_day(2025, 2) == 28
+ assert converter.max_day(converter.LEAP_YEAR, 2) == 29
+ assert converter.max_day(2025, 12) == 31
+
+ def test_representative_years(self):
+ converter = GregorianDateConverter()
+ # single year is not filtered
+ assert converter.representative_years([2025]) == [2025]
+ # multiple non-leap years, returns just the first
+ assert converter.representative_years([2025, 2026]) == [2025]
+ # next leap year is 2028; returns first leap year and first non-leap year, in input order
+ assert converter.representative_years([2025, 2026, 2028, 2029]) == [2025, 2028]
+
+ # if no years are provided, returns a known leap year and non-leap year
+ assert converter.representative_years() == [
+ converter.LEAP_YEAR,
+ converter.NON_LEAP_YEAR,
+ ]
+ assert converter.representative_years([]) == [
+ converter.LEAP_YEAR,
+ converter.NON_LEAP_YEAR,
+ ]
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py
index c3c8b7c..6fe8c96 100644
--- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py
@@ -153,3 +153,37 @@ def test_compare_across_calendars(self):
)
expected_gregorian_years = [-3261, 33, 1056, 1350, 1655, 1995]
assert [d.earliest.year for d in sorted_dates] == expected_gregorian_years
+
+ def test_days_in_year(self):
+ converter = HebrewDateConverter()
+ assert converter.days_in_year(4816) == 353
+ assert converter.days_in_year(4817) == 355
+ assert converter.days_in_year(4818) == 384
+ assert converter.days_in_year(4819) == 355
+
+ def test_representative_years(self):
+ converter = HebrewDateConverter()
+ # single year is not filtered
+ assert converter.representative_years([4816]) == [4816]
+ # 4816 has 353 days; 4817 has 355; 4818 has 384; 4819 has 355
+ assert converter.representative_years([4816, 4817, 4818, 4819]) == [
+ 4816,
+ 4817,
+ 4818,
+ ]
+ assert converter.representative_years([4816, 4817, 4818, 4819, 4837]) == [
+ 4816,
+ 4817,
+ 4818,
+ 4837,
+ ]
+
+ # if no years are provided, returns a known leap year and non-leap year
+ assert converter.representative_years() == [
+ converter.LEAP_YEAR,
+ converter.NON_LEAP_YEAR,
+ ]
+ assert converter.representative_years([]) == [
+ converter.LEAP_YEAR,
+ converter.NON_LEAP_YEAR,
+ ]
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
index 6e4a5e6..7dcca83 100644
--- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
@@ -26,6 +26,12 @@ def test_hebrew_undate():
("5362", HebrewUndate(5362), DatePrecision.YEAR),
# add when we support parsing ranges:
# Adar I and Adar II 5453 : (1693 CE)
+ # support weekdays included in text
+ ("Thursday, 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
+ # with or without comma
+ ("Thursday 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
+ # huh, current parsing completely ignores whitespace; do we want that?
+ ("Thursday12Sivan4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
]
diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py
index 4acacd0..cfcace2 100644
--- a/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py
+++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py
@@ -152,3 +152,23 @@ def test_compare_across_calendars(self):
)
expected_gregorian_years = [33, 1049, 1350, 1479, 1495, 1995]
assert [d.earliest.year for d in sorted_dates] == expected_gregorian_years
+
+ def test_representative_years(self):
+ converter = IslamicDateConverter()
+ # single year is not filtered
+ # 1458 is a leap year; 1457 and 1459 are not
+ assert converter.representative_years([1457]) == [1457]
+ # multiple non-leap years, returns just the first
+ assert converter.representative_years([1457, 1459]) == [1457]
+ # 1458 is a leap year; returns first non-leap year and first leap year, in input order
+ assert converter.representative_years([1457, 1458, 1459]) == [1457, 1458]
+
+ # if no years are provided, returns a known leap year and non-leap year
+ assert converter.representative_years() == [
+ converter.LEAP_YEAR,
+ converter.NON_LEAP_YEAR,
+ ]
+ assert converter.representative_years([]) == [
+ converter.LEAP_YEAR,
+ converter.NON_LEAP_YEAR,
+ ]
diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
index 951a9f8..04ff53b 100644
--- a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
+++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
@@ -28,6 +28,7 @@ def test_islamic_undate():
# examples from ISMI data (reformatted to day month year)
# Rabi 1 = month 3
("14 Rabīʿ I 901", IslamicUndate(901, 3, 14), DatePrecision.DAY),
+ ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH),
("884", IslamicUndate(884), DatePrecision.YEAR),
# Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)),
# add when we support parsing ranges:
diff --git a/tests/test_converters/test_calendars/test_seleucid.py b/tests/test_converters/test_calendars/test_seleucid.py
new file mode 100644
index 0000000..d07e5f1
--- /dev/null
+++ b/tests/test_converters/test_calendars/test_seleucid.py
@@ -0,0 +1,115 @@
+from undate.converters.calendars import SeleucidDateConverter
+from undate.date import Date, DatePrecision
+from undate.undate import Calendar, Undate
+
+
+class TestSeleucidDateConverter:
+ def test_parse(self):
+ # day
+ # Elul = month 6; 7 September, 1000 Gregorian
+ date_str = "29 Elul 1311"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date == Undate(1311, 6, 29, calendar="Seleucid")
+ assert date.calendar == Calendar.SELEUCID
+ assert date.precision == DatePrecision.DAY
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ date_str = "Tishri 1458" # month 7
+ date = SeleucidDateConverter().parse(date_str)
+ assert date == Undate(1458, 7, calendar="Seleucid")
+ assert date.calendar == Calendar.SELEUCID
+ assert date.precision == DatePrecision.MONTH
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ # year
+ date_str = "1458"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date == Undate(1458, calendar="Seleucid")
+ assert date.calendar == Calendar.SELEUCID
+ assert date.precision == DatePrecision.YEAR
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ def test_gregorian_earliest_latest(self):
+ # earliest/latest should be converted to Gregorian for comparison
+
+ # full date
+ # Elul = month 6 (7 September, 1000 Gregorian)
+ date_str = "29 Elul 1311"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date.earliest == Date(1000, 9, 7)
+ assert date.latest == Date(1000, 9, 7)
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ date_str = "23 Adar I 1475"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date.earliest == Date(1164, 2, 25)
+ assert date.latest == Date(1164, 2, 25)
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ # month/year
+ date_str = "Tishri 1458"
+ date = SeleucidDateConverter().parse(date_str)
+ assert date.earliest == Date(1146, 9, 16)
+ assert date.latest == Date(1146, 10, 15)
+ assert date.label == f"{date_str} {SeleucidDateConverter.calendar_name}"
+
+ def test_days_in_year(self):
+ converter = SeleucidDateConverter()
+ assert converter.days_in_year(2350) == 354
+ assert converter.days_in_year(2349) == 385
+ assert converter.days_in_year(2351) == 355
+
+
+# TODO: update validation error to say seleucid instead of hebrew
+
+# seleucid_year = 1458
+# converted_date = convert_seleucid_date(f"Tishri {seleucid_year}")
+# converted_date_am = convert_hebrew_date(
+# f"Tishrei {seleucid_year + Calendar.SELEUCID_OFFSET}"
+# )
+# # the converted date range for Tishri Sel. should be the same as that for Tishri AM - 3449 years.
+# assert converted_date[0] == converted_date_am[0]
+# assert converted_date[1] == converted_date_am[1]
+
+# # leap day (Feb 29, 2020) should convert properly
+# converted_date = convert_seleucid_date("4 Adar 2331")
+# assert converted_date[1] == date(2020, 2, 29)
+
+
+# # 26 Tammuz 4816: 17 July, 1056; Tammuz = month 4
+# date = Undate(4816, 4, 26, calendar="Seleucid")
+# assert date.earliest == Date(1056, 7, 17)
+# assert date.latest == Date(1056, 7, 17)
+# # 13 Tishrei 5416 Anno Mundi (1655-10-14)
+# date = Undate(5416, 7, 13, calendar="Seleucid") # Tishrei = month 7
+# assert date.earliest == Date(1655, 10, 14)
+# assert date.latest == Date(1655, 10, 14)
+
+
+# from pgp tests
+
+
+# # month/year
+# seleucid_year = 1458
+# converted_date = convert_seleucid_date(f"Tishri {seleucid_year}")
+# converted_date_am = convert_hebrew_date(
+# f"Tishrei {seleucid_year + Calendar.SELEUCID_OFFSET}"
+# )
+# # the converted date range for Tishri Sel. should be the same as that for Tishri AM - 3449 years.
+# assert converted_date[0] == converted_date_am[0]
+# assert converted_date[1] == converted_date_am[1]
+
+# # leap day (Feb 29, 2020) should convert properly
+# converted_date = convert_seleucid_date("4 Adar 2331")
+# assert converted_date[1] == date(2020, 2, 29)
+
+# # leap year (4826 AM = 1377 Seleucid) should convert properly
+# seleucid_year = 1377
+# converted_date = convert_seleucid_date(f"21 Adar II {seleucid_year}")
+# converted_date_am = convert_hebrew_date(
+# f"21 Adar II {seleucid_year + Calendar.SELEUCID_OFFSET}"
+# )
+# assert converted_date[0] == converted_date_am[0]
+# assert converted_date[1] == converted_date_am[1]
+# # and it should be converted to 1066-03-21 CE
+# assert converted_date[1] == date(1066, 3, 21)
diff --git a/tests/test_date.py b/tests/test_date.py
index 5ff017d..24703cb 100644
--- a/tests/test_date.py
+++ b/tests/test_date.py
@@ -1,5 +1,18 @@
+import datetime
+
import numpy as np
-from undate.date import ONE_YEAR, Date, DatePrecision, Timedelta
+import pytest
+
+from undate.date import (
+ ONE_DAY,
+ ONE_YEAR,
+ ONE_MONTH_MAX,
+ Date,
+ DatePrecision,
+ Timedelta,
+ UnDelta,
+ UnInt,
+)
class TestDatePrecision:
@@ -51,6 +64,31 @@ def test_properties_day(self):
assert Date(2010, 5).day is None
assert Date(2021, 6, 15).day == 15
+ def test_weekday(self):
+ # thursday
+ assert Date(2025, 1, 2).weekday == 3
+ assert Date(2025, 1, 2).weekday == datetime.date(2025, 1, 2).weekday()
+ # friday
+ assert Date(2025, 1, 3).weekday == 4
+ assert Date(2025, 1, 3).weekday == datetime.date(2025, 1, 3).weekday()
+ # saturday
+ assert Date(2025, 1, 4).weekday == 5
+ assert Date(2025, 1, 4).weekday == datetime.date(2025, 1, 4).weekday()
+ # sunday
+ assert Date(2025, 1, 5).weekday == 6
+ assert Date(2025, 1, 5).weekday == datetime.date(2025, 1, 5).weekday()
+ # monday
+ assert Date(2025, 1, 6).weekday == 0
+ assert Date(2025, 1, 6).weekday == datetime.date(2025, 1, 6).weekday()
+ # tuesday
+ assert Date(2025, 1, 7).weekday == 1
+ assert Date(2025, 1, 7).weekday == datetime.date(2025, 1, 7).weekday()
+
+ # when a date is not day-level precision, no weekday is returned
+ yearonly_date = Date(2025)
+ assert yearonly_date.dtype == "datetime64[Y]"
+ assert yearonly_date.weekday is None
+
def test_substract(self):
# date - date = timedelta
date_difference = Date(2024, 1, 2) - Date(2024, 1, 1)
@@ -77,3 +115,158 @@ def test_init_from_np_timedelta64(self):
def test_days(self):
assert Timedelta(10).days == 10
+
+
+class TestUnInt:
+ def test_init(self):
+ february_days = UnInt(28, 29) # 28 or 29
+ assert february_days.lower == 28
+ assert february_days.upper == 29
+
+ # also supports keyword args
+ anymonth_days = UnInt(lower=28, upper=31)
+ assert anymonth_days.lower == 28
+ assert anymonth_days.upper == 31
+
+ def test_init_validation(self):
+ with pytest.raises(
+ ValueError, match=r"Lower value \(10\) must be less than upper \(4\)"
+ ):
+ UnInt(10, 4)
+
+ def test_contains(self):
+ anymonth_days = UnInt(lower=28, upper=31)
+ # integer
+ assert 28 in anymonth_days
+ assert 29 in anymonth_days
+ assert 31 in anymonth_days
+ assert 32 not in anymonth_days
+ # unint
+ assert UnInt(28, 29) in anymonth_days
+
+ # other types are assumed not in range
+ assert "twenty-eight" not in anymonth_days
+
+ def test_gt(self):
+ ten_twelve = UnInt(10, 12)
+ # compare with integer
+ assert ten_twelve > 9
+ assert not ten_twelve > 12
+ assert not ten_twelve > 15
+ # compare with unint
+ assert ten_twelve > UnInt(2, 4)
+ assert not ten_twelve > UnInt(12, 24)
+ assert not ten_twelve > UnInt(13, 23)
+ # unsupported type
+ with pytest.raises(TypeError):
+ ten_twelve > "three"
+
+ def test_lt(self):
+ ten_twelve = UnInt(10, 12)
+ # compare with integer
+ assert ten_twelve < 13
+ assert not ten_twelve < 12
+ assert not ten_twelve < 9
+ # compare with unint
+ assert ten_twelve < UnInt(13, 23)
+ assert not ten_twelve < UnInt(12, 24)
+ assert not ten_twelve < UnInt(2, 4)
+ # unsupported type
+ with pytest.raises(TypeError):
+ ten_twelve < "three"
+
+ def test_iterable(self):
+ anymonth_days = UnInt(lower=28, upper=31)
+ assert list(anymonth_days) == [28, 29, 30, 31]
+
+ def test_add(self):
+ february_days = UnInt(28, 29)
+ # add integer
+ assert february_days + 1 == UnInt(29, 30)
+ # add UnInt - minimum is 28 + 1, maximum is 29 + 2
+ assert february_days + UnInt(1, 2) == UnInt(29, 31)
+ # other types are not supported
+ with pytest.raises(TypeError, match="unsupported operand"):
+ february_days + "two"
+
+ def test_subtract(self):
+ february_days = UnInt(28, 29)
+ # subtract integer
+ assert february_days - 10 == UnInt(18, 19)
+ # subtract UnInt - minimum is lower - largest value, maximum is upper - smallest value
+ # difference between number of days in any month and the month of February?
+ # [28,31] - [28,29] = [-1, 3]
+ anymonth_days = UnInt(lower=28, upper=31)
+ assert anymonth_days - february_days == UnInt(-1, 3)
+ # what if we go the other direction?
+ assert february_days - anymonth_days == UnInt(-3, 1)
+ # other types are not supported
+ with pytest.raises(TypeError, match="unsupported operand"):
+ february_days - "two"
+
+
+class TestUnDelta:
+ def test_init(self):
+ # February in an unknown year in Gregorian calendar could be 28 or 29 days
+ february_days = UnInt(28, 29) # 28 or 29
+ udelt = UnDelta(28, 29)
+ assert isinstance(udelt.days, UnInt)
+ assert udelt.days.lower == 28
+ assert udelt.days.upper == 29
+
+ # NOTE: default portion interval comparison may not be what we want here,
+ # since this is an unknown value within the range...
+ # (maybe handled in undelta class comparison methods)
+ assert udelt.days == february_days
+
+ # do the right thing with more than one value, out of order
+ unknown_month_duration = UnDelta(30, 31, 28)
+ assert isinstance(unknown_month_duration.days, UnInt)
+ assert unknown_month_duration.days.lower == 28
+ assert unknown_month_duration.days.upper == 31
+
+ def test_init_validation(self):
+ with pytest.raises(ValueError, match="Must specify at least two values"):
+ UnDelta(10)
+
+ def test_repr(self):
+ # customized string representation
+ assert repr(UnDelta(28, 29)) == "UnDelta(days=[28,29])"
+
+ def test_eq(self):
+ # uncertain deltas are not equivalent
+ udelt1 = UnDelta(30, 31)
+ udelt2 = UnDelta(30, 31)
+ # not equal to equivalent undelta range
+ assert udelt1 != udelt2
+ # equal to self
+ assert udelt1 is udelt1
+
+ def test_lt(self):
+ week_or_tenday = UnDelta(7, 10)
+ # compare undelta with undelta
+ month = UnDelta(28, 31)
+ # a week or ten-day is unambiguously less than a month
+ assert week_or_tenday < month
+ # compare undelta with Timedelta
+ # NOTE: currently requires this direction, until we update Timedelta to support the reverse comparison
+ assert not week_or_tenday < ONE_DAY
+ # an uncertain month is unambiguously less than a year
+ assert month < ONE_YEAR
+ # an uncertain month may or may not be less than one month max
+ assert not month < ONE_MONTH_MAX
+
+ def test_gt(self):
+ week_or_tenday = UnDelta(7, 10)
+ # compare undelta with undelta
+ month = UnDelta(28, 31)
+ # a month is unambiguously longer than week or ten-day
+ assert month > week_or_tenday
+ # compare undelta with Timedelta
+ # NOTE: currently requires this direction, until we update Timedelta
+ # to support the reverse comparison
+ assert week_or_tenday > ONE_DAY
+ # an uncertain month is not greater than a year
+ assert not month > ONE_YEAR
+ # an uncertain month may or may not be greater than one month max
+ assert not month > ONE_MONTH_MAX
diff --git a/tests/test_undate.py b/tests/test_undate.py
index 18e03b0..2cbaf7d 100644
--- a/tests/test_undate.py
+++ b/tests/test_undate.py
@@ -1,10 +1,13 @@
from datetime import date, datetime
+from enum import auto
+from unittest import mock
import pytest
from undate import Undate, UndateInterval, Calendar
-from undate.converters.base import BaseCalendarConverter
-from undate.date import Date, DatePrecision, Timedelta
+from undate.undate import StrEnum # import whichever version is used there
+from undate.converters.base import BaseCalendarConverter, BaseDateConverter
+from undate.date import Date, DatePrecision, Timedelta, UnDelta, UnInt
class TestUndate:
@@ -130,6 +133,16 @@ def test_calendar(self):
with pytest.raises(ValueError, match="Calendar `foobar` is not supported"):
Undate(848, calendar="foobar")
+ def test_as_calendar(self):
+ # changes calendar *without* converting dates
+ assert Undate(1243, 5, 7).as_calendar(Calendar.ISLAMIC) == Undate(
+ 1243, 5, 7, calendar=Calendar.ISLAMIC
+ )
+ # should also work with string
+ assert Undate(1243, 5, 7).as_calendar("islamic") == Undate(
+ 1243, 5, 7, calendar=Calendar.ISLAMIC
+ )
+
def test_init_invalid(self):
with pytest.raises(ValueError):
Undate("19??")
@@ -298,11 +311,17 @@ def test_lt_notimplemented(self):
# how to compare mixed precision where dates overlap?
# if the second date falls *within* earliest/latest,
# then it is not clearly less; not implemented?
- with pytest.raises(NotImplementedError, match="date falls within the other"):
+ with pytest.raises(
+ NotImplementedError,
+ match="one date \\(2022-05\\) falls within the other \\(2022\\)",
+ ):
assert Undate(2022) < Undate(2022, 5)
# same if we attempt to compare in the other direction
- with pytest.raises(NotImplementedError, match="date falls within the other"):
+ with pytest.raises(
+ NotImplementedError,
+ match="one date \\(2022-05\\) falls within the other \\(2022\\)",
+ ):
assert Undate(2022, 5) < Undate(2022)
testdata_contains = [
@@ -377,6 +396,45 @@ def test_sorting(self):
# someyear = Undate("1XXX")
# assert sorted([d1991, someyear]) == [someyear, d1991]
+ def test_possible_years(self):
+ assert Undate(1991).possible_years == [1991]
+ assert Undate("190X").possible_years == range(1900, 1910)
+ assert Undate("19XX").possible_years == range(1900, 2000)
+ # uses step when missing digit is not last digit
+ assert Undate("19X1").possible_years == range(1901, 1992, 10)
+ assert Undate("2X25").possible_years == range(2025, 2926, 100)
+ assert Undate("1XXX").possible_years == range(1000, 2000)
+ # completely unknown year raises value error, because the range is not useful
+ with pytest.raises(
+ ValueError, match="cannot be returned for completely unknown year"
+ ):
+ assert Undate("XXXX").possible_years
+
+ def test_representative_years(self):
+ # single year is returned as is
+ assert Undate("1991").representative_years == [1991]
+ # for an uncertain year, returns first leap year and non-leap year in range
+ assert Undate("190X").representative_years == [1900, 1904]
+ assert Undate("19XX").representative_years == [1900, 1904]
+ # works for other calendars
+ assert Undate("481X", calendar="Hebrew").representative_years == [
+ 4810,
+ 4811,
+ 4812,
+ 4813,
+ 4816,
+ 4818,
+ ]
+
+ # use mock to simulate a calendar without representative years filtering
+ with mock.patch(
+ "undate.converters.calendars.HebrewDateConverter.representative_years"
+ ) as mock_representative_years:
+ mock_representative_years.side_effect = NotImplementedError
+ assert Undate("481X", calendar="Hebrew").representative_years == list(
+ range(4810, 4820)
+ )
+
def test_duration(self):
day_duration = Undate(2022, 11, 7).duration()
assert isinstance(day_duration, Timedelta)
@@ -404,10 +462,39 @@ def test_partiallyknown_duration(self):
# month in unknown year
assert Undate(month=6).duration().days == 30
# partially known month
- assert Undate(year=1900, month="1X").duration().days == 31
- # what about february?
- # could vary with leap years, but assume non-leapyear
- assert Undate(month=2).duration().days == 28
+ # 1X = October, November, or December = 30 or 31 days
+ # should return a Undelta object
+ unknown_month_duration = Undate(year=1900, month="1X").duration()
+ assert isinstance(unknown_month_duration, UnDelta)
+ assert unknown_month_duration.days == UnInt(30, 31)
+
+ # completely unknown month should also return a Undelta object
+ unknown_month_duration = Undate(year=1900, month="XX").duration()
+ assert isinstance(unknown_month_duration, UnDelta)
+ # possible range is 28 to 31 days
+ assert unknown_month_duration.days == UnInt(28, 31)
+
+ # the number of days in February of an unknown year is uncertain, since
+ # it could vary with leap years; either 28 or 29 days
+ feb_duration = Undate(month=2).duration()
+ assert isinstance(feb_duration, UnDelta)
+ assert feb_duration.days == UnInt(28, 29)
+
+ def test_partiallyknownyear_duration(self):
+ assert Undate("190X").duration().days == UnInt(365, 366)
+ assert Undate("XXXX").duration().days == UnInt(365, 366)
+ # if possible years don't include any leap years, duration is not ambiguous
+ assert Undate("19X1").duration().days == 365
+ # year duration logic should work in other calendars
+ # islamic
+ assert Undate("108X", calendar="Islamic").duration().days == UnInt(354, 355)
+ # duration for a completely unknown year is calculated based on representative years
+ assert Undate("XXXX", calendar="Islamic").duration().days == UnInt(354, 355)
+ assert Undate("536X", calendar="Hebrew").duration().days == UnInt(353, 385)
+ # different set of years could vary
+ assert Undate("53X2", calendar="Hebrew").duration().days == UnInt(354, 385)
+ # fully unknown year also works for Hebrew calendar
+ assert Undate("XXX", calendar="Hebrew").duration().days == UnInt(353, 385)
def test_known_year(self):
assert Undate(2022).known_year is True
@@ -479,3 +566,25 @@ def test_calendar_get_converter():
converter = Calendar.get_converter(cal)
assert isinstance(converter, BaseCalendarConverter)
assert converter.name.lower() == cal.name.lower()
+
+ class BogusCalendar(StrEnum):
+ """Unsupported calendars"""
+
+ FOOBAR = auto()
+ DUMMY = auto()
+
+ # test error handling
+ # ensure we raise a ValueError when an invalid calendar is requested
+ with pytest.raises(ValueError, match="Unknown calendar"):
+ Calendar.get_converter(BogusCalendar.FOOBAR)
+
+ class DummyFormatter(BaseDateConverter):
+ name = "Dummy"
+
+ # also error if you request a converter that is not a calendar converter
+ # NOTE: this fails because get_converter converts the enum to title case...
+ # can't be tested with any of the existing non-calendar converters
+ with pytest.raises(
+ ValueError, match="Requested converter 'Dummy' is not a CalendarConverter"
+ ):
+ Calendar.get_converter(BogusCalendar.DUMMY)