Coverage for src/crawler/by_source/dmlbul_crawler.py: 94%
83 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import json

import regex
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as the article (2 pdfs).
# Handle this by creating another article as an erratum (cf. http://www.numdam.org/item/SEDP_1971-1972____A1_0/)
class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    periode_begin = 0
    periode_end = 9999
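    # The 0-9999 range presumably means "no year restriction" for this source
    # (inferred from the values above; not documented here).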

    # Regexes to detect issue metadata, ordered from most specific to least specific
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]
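    # Illustrative (hypothetical) citation strings the patterns above would match,
    # shown here only to document the expected formats:
    #   "Mathematica Balkanica, Vol. 20, No 1, (2006), 5p-20p"
    #   "Volume 9, Number 2, 2006, pp. 101-132"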

    def parse_collection_content(self, content):
        """
        Parse the JSON description of the collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.

        self.periode is set afterwards, based on the xissue years.
        """
        data = json.loads(content)
        collection_oid = data["_id"]
        articles = self.query_article_list(collection_oid)
        issues = self.extract_issue_list(articles)
        return issues

    def query_article_list(self, collection_oid: str):
        """Page through the search API, at most 200 items per request, until every article is fetched."""
        articles_data = []
        count = float("inf")
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": collection_oid, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data
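
    # The /api/object/search response is assumed to be JSON of the form
    #   {"count": <total number of matches>, "data": [<article descriptions>, ...]}
    # (inferred from the pagination loop above, not from an official API spec).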

    def extract_issue_list(self, article_list: list[dict]):
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # Try each pattern in turn, keeping the first match
            issue_re = None
            for issue_regex in self.issue_regexes:
                issue_re = regex.search(issue_regex, issue_str)
                if issue_re:
                    break

            if not issue_re:  # coverage: condition never true in tests
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed"
                )
            issue_dict = issue_re.groupdict()

            if issue_dict.get("pagestart", None):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend", None):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number", None),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
                )
                issues[pid] = issue

            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())
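
    # Issue pids appear to follow a "<COLLECTION>_<year>_<volume>_<number>" shape
    # (built by get_issue_pid), with per-issue article pids "a0", "a1", ...;
    # cf. the pid FCAA_2006_9_2_a5 mentioned in the TODO at the top of this file.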

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        data = json.loads(content)
        xarticle.pid = pid
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"  # ISO 639 code for "undetermined"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(
                    lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
                )
            )
        # keywords
        if "subject" in data:  # coverage: condition always true in tests
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:  # coverage: condition never true in tests
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            print(
                f"[{self.source_domain}] {xarticle.pid} : Article URL not found, falling back to handle.net. Please check link validity : {url}"
            )
        xarticle.url = url
        if len(data["media"]) == 0:  # coverage: condition never true in tests
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle
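
# Minimal usage sketch (hypothetical: the constructor and crawl entry points come
# from BaseCollectionCrawler, not shown here, so the argument names are assumptions):
#
#   crawler = DmlbulCrawler(collection_id="FCAA", collection_url="https://buldml.math.bas.bg/...")
#   xissues = crawler.parse_collection_content(raw_collection_json)
#   for xissue in xissues:
#       for xarticle in xissue.articles:
#           ...  # fetch xarticle.url and pass the payload to parse_article_content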