@@ -33,7 +33,7 @@ public function parsePageHtml(string $filename, string $spaceName): array
3333 ];
3434 }
3535
36- public function htmlFile2Markdown (string $ filename )
36+ public function htmlFile2Markdown (string $ filename ): string
3737 {
3838 libxml_use_internal_errors (true );
3939 $ this ->document ->loadHTMLFile ($ filename );
@@ -42,6 +42,39 @@ public function htmlFile2Markdown(string $filename)
4242 return $ this ->htmlConverter ->convert ($ html );
4343 }
4444
/**
 * Collect the attachments listed in the "attachments" section of an HTML page.
 *
 * The section is the first <div class="pageSection"> below the element with
 * id "content" whose first <h2> carries the id "attachments". Every <a>
 * inside that section contributes one entry to the result.
 *
 * @param string $htmlFilename    Path of the HTML file to load.
 * @param string $markdownContent Optional Markdown of the same page. When it
 *                                is not empty, attachments that already appear
 *                                in it as images are left out of the result.
 *
 * @return array Map of link href => link text. Empty when the page has no
 *               "content" element or no attachments section.
 */
public function parseAttachments($htmlFilename, $markdownContent = ''): array
{
    // Collect libxml parse warnings internally instead of emitting them.
    libxml_use_internal_errors(true);
    $this->document->loadHTMLFile($htmlFilename);

    // getElementById() returns null when the id is absent; bail out instead
    // of calling a method on null.
    $contentElement = $this->document->getElementById('content');
    if ($contentElement === null) {
        return [];
    }

    // Keep the match in its own variable: a foreach loop variable still holds
    // the last visited <div> after the loop, so it cannot signal "not found".
    $attachmentsSection = null;
    foreach ($contentElement->getElementsByTagName('div') as $divElement) {
        if ($divElement->getAttribute('class') != 'pageSection') {
            continue;
        }
        $h2Element = $divElement->getElementsByTagName('h2')->item(0);
        if ($h2Element !== null && $h2Element->getAttribute('id') == 'attachments') {
            $attachmentsSection = $divElement;
            break;
        }
    }
    if ($attachmentsSection === null) {
        return [];
    }

    $attachments = [];
    foreach ($attachmentsSection->getElementsByTagName('a') as $aElement) {
        $filePath = $aElement->getAttribute('href');
        $filename = $aElement->nodeValue;
        // NOTE(review): the needle below was reconstructed; the original
        // string literal was lost. It presumably matches the Markdown image
        // syntax for this file - confirm against the converter's output.
        if (!str_contains($markdownContent, "![]($filePath")) {
            $attachments[$filePath] = $filename;
        }
    }
    return $attachments;
}
77+
4578 /**
4679 * @return array ['tree' => "array", 'titles' => "array"]
4780 */
0 commit comments