Coverage for src/crawler/by_source/dmlbul_crawler.py: 96% (81 statements)
import json

import regex
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# TODO: PID FCAA_2006_9_2_a5: an erratum on the same page as the article (2 PDFs).
# Handle this by creating a separate erratum article, as in https://www.numdam.org/item/SEDP_1971-1972____A1_0/
class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    # Regexes to detect issue metadata, from most specific to least specific
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]
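    # For illustration, a hypothetical citation string (not taken from DMLBUL
    # data) matched by the second regex, in APA-like style:
    #   "(2004). Some title, 30(2-3), 100-120."
    #   -> year=2004, volume=30, number=2-3, pagestart=100, pageend=120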

    def parse_collection_content(self, content):
        """
        Parse the collection's JSON record and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        data = json.loads(content)
        collection_oid = data["_id"]
        articles = self.query_article_list(collection_oid)
        issues = self.extract_issue_list(articles)
        return issues

    def query_article_list(self, collection_oid: str):
        articles_data = []
        count = float("inf")
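        # The total number of matching items is unknown until the first
        # response, so start from an infinite sentinel: each POST pages through
        # the search endpoint via skip/limit (at most 200 items per request).
        # The response is assumed to carry {"count": <total>, "data": [...]}.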
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": collection_oid, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data

    def extract_issue_list(self, article_list: list[dict]):
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # Try each regex in order (most specific first) until one matches.
            issue_re = None
            for issue_regex in self.issue_regexes:
                issue_re = regex.search(issue_regex, issue_str)
                if issue_re:
                    break

            if not issue_re:
                self.logger.error(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed",
                    extra={"input": _a},
                )
                continue
            issue_dict = issue_re.groupdict()

            if issue_dict.get("pagestart"):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend"):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]
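
            # Group articles into issues keyed by the issue pid; article pids
            # ("a0", "a1", ...) are positional within their issue.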
            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number"),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number")
                )
                issues[pid] = issue

            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
    ):
        data = json.loads(content)
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            # "und" is the ISO 639-2 code for an undetermined language
            xarticle.lang = "und"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(
                    lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
                )
            )
        # keywords
        if "subject" in data:
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            self.logger.info(
                f"Article URL not found, falling back to handle.net. Please check link validity: {url}",
                extra={"pid": xarticle.pid},
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF not found")
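        # An article may ship several PDFs (e.g. the errata case noted in the
        # module-level TODO); register a link for each media entry.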
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle
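

# Minimal self-check sketch for the citation regexes (the sample strings are
# hypothetical, not taken from DMLBUL data): run the module directly to see
# which pattern fires and what it captures.
if __name__ == "__main__":
    samples = [
        "Mathematica Balkanica, Vol. 30, No 2-3, (2004), 100p-120p",
        "Volume 30, Number 2-3, 2004, pp. 100-120",
    ]
    for sample in samples:
        for pattern in DmlbulCrawler.issue_regexes:
            match = regex.search(pattern, sample)
            if match:
                print(sample, "->", match.groupdict())
                break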