Coverage for src/crawler/by_source/dmlbul_crawler.py: 94%
80 statements

import json

import regex
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as the article (2 pdfs)
# Handle this by creating a separate erratum article, as in http://www.numdam.org/item/SEDP_1971-1972____A1_0/
class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    # Regexes to detect issue metadata, from most specific to least
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]
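    # Illustrative (hypothetical, not taken from the site) citation strings
    # that the regexes above are meant to match:
    #   "Mathematica Balkanica, Vol. 26, No 1-2, (2012), 1p-16p"  -> issue_regexes[0]
    #   "Volume 5, Number 3, 2011, pp. 169-184"                   -> issue_regexes[2]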

    def parse_collection_content(self, content):
        """
        Parse the collection JSON returned by the DMLBUL API and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        data = json.loads(content)
        category_id = data["_id"]
        articles = self.query_article_list(category_id)
        issues = self.extract_issue_list(articles)
        return issues

    def query_article_list(self, category_id: str):
        articles_data = []
        count = float("inf")  # total number of articles, learned from the first response
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": category_id, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
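            # The search endpoint is assumed (from the fields read below) to
            # reply with {"count": <total matches>, "data": [<objects>]};
            # pages of at most 200 objects are fetched until `count` is reached.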
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data

    def extract_issue_list(self, article_list: list[dict]):
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # Try each issue regex in turn, keeping the first that matches
            issue_re = None
            for issue_regex in self.issue_regexes:
                issue_re = regex.search(issue_regex, issue_str)
                if issue_re:
                    break

            if not issue_re:
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed"
                )
            issue_dict = issue_re.groupdict()
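            # For a (hypothetical) citation "Volume 5, Number 3, 2011, pp. 169-184",
            # issue_dict would be {"volume": "5", "number": "3", "year": "2011",
            # "pagestart": "169", "pageend": "184"}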

            if issue_dict.get("pagestart", None):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend", None):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number", None),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
                )
                issues[pid] = issue

            article.pid = "a" + str(len(issues[pid].articles))
            issue.articles.append(article)
        return list(issues.values())

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
    ):
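        # The article JSON is assumed (from the keys accessed below) to be
        # shaped roughly like:
        #   {"title": {"_v": ...}, "language": {"_v": ...}, "authors": {"_v": [...]},
        #    "abstract": {"_v": ...}, "subject": {"_v": [...]},
        #    "identifier": {"uri": {"_v": ...}}, "_alias": {"_v": ...},
        #    "media": [{"file": {"_v": ...}}, ...]}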
        data = json.loads(content)
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(
                    lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
                )
            )
        # keywords
        if "subject" in data:
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            print(
                f"[{self.source_domain}] {xarticle.pid} : Article URL not found, falling back to handle.net. Please check link validity: {url}"
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF not found")
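        # A record may carry several PDFs (cf. the errata TODO at the top of
        # the file); each media entry gets its own PDF link on the article.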
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle