# src/crawler/by_source/dmlbul_crawler.py
import json

import regex
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as the article (2 PDFs).
# Handle this by creating another article as an erratum, as in https://www.numdam.org/item/SEDP_1971-1972____A1_0/
class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    # Regexes to detect issue metadata, ordered from most specific to least specific.
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]
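
    # Illustrative citation strings these patterns are meant to match
    # (hypothetical examples, not taken from the live API):
    #   "Mathematica Balkanica, Vol. 26, No 1-2, (2012), 13p-24p"      (MEM-style)
    #   "Volume 21, Number 3, 1995, pp. 185-192"                       (SMJ2-style)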

    def parse_collection_content(self, content):
        """
        Parse the JSON description of a DMLBUL collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        data = json.loads(content)
        collection_oid = data["_id"]
        articles = self.query_article_list(collection_oid)
        issues = self.extract_issue_list(articles)
        return issues
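
    # The collection payload is assumed to be a JSON object carrying at least
    # an "_id" field (the collection's object id); only "_id" is read above.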

    def query_article_list(self, collection_oid: str):
        """Fetch all articles of a collection, paging through the search API."""
        articles_data = []
        count = float("inf")
        # Page through the results until the reported total is reached.
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": collection_oid, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data
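
    # The search endpoint is assumed to respond with a payload shaped like
    # {"count": <total>, "data": [{"identifier": ..., "_alias": ...}, ...]},
    # which is what query_article_list and extract_issue_list read.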

    def extract_issue_list(self, article_list: list[dict]):
        """Group articles into issues based on their citation strings."""
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # Try each pattern in turn, from most specific to least specific.
            issue_re = None
            for issue_regex in self.issue_regexes:
                issue_re = regex.search(issue_regex, issue_str)
                if issue_re:
                    break

            if not issue_re:
                self.logger.error(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed",
                    extra={"input": _a},
                )
                continue
            issue_dict = issue_re.groupdict()

            if issue_dict.get("pagestart"):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend"):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number"),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number")
                )
                issues[pid] = issue

            # Articles within an issue get sequential pids: "a0", "a1", ...
            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())
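
    # Example (illustrative): two articles whose citations both parse to
    # year="2012", volume="26", number="1-2" share one issue pid, so they are
    # appended to the same xissue and receive pids "a0" and "a1" in turn.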

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
    ):
        data = json.loads(content)
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(lang=xarticle.lang, value_tex=data["abstract"]["_v"])
            )
        # keywords
        if "subject" in data:
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            self.logger.info(
                f"Article URL not found, falling back to handle.net. Please check link validity : {url}",
                extra={"pid": xarticle.pid},
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle
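

# Minimal usage sketch (hypothetical: the constructor arguments and crawl entry
# point are defined by BaseCollectionCrawler, which is not shown here):
#
#   crawler = DmlbulCrawler(...)  # collection-specific setup
#   xissues = crawler.parse_collection_content(raw_collection_json)
#   for xissue in xissues:
#       for xarticle in xissue.articles:
#           # fetch xarticle.url, then:
#           # crawler.parse_article_content(response_text, xissue, xarticle, xarticle.url)
#           ...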