Coverage for src/crawler/by_source/dmlbul_crawler.py: 94%
83 statements
import json

import regex
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"
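
    # Accept any year up front; self.periode is derived from the parsed
    # xissue years (see parse_collection_content).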
    periode_begin = 0
    periode_end = 9999

    # Regexes to detect issue metadata, from most specific to least
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]
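    # The first (MEM) pattern, for example, matches citations shaped like
    # "Mathematica Balkanica, Vol. 26, No 1-2, (2012), 1p-10p" (illustrative
    # string, not a real record), capturing volume, number, year and the
    # page range.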

    def parse_collection_content(self, content):
        """
        Parse the JSON description of a DMLBUL collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.

        self.periode is set at the end based on the xissue years.
        """
        data = json.loads(content)
        collection_oid = data["_id"]
        articles = self.query_article_list(collection_oid)
        issues = self.extract_issue_list(articles)
        return issues

    def query_article_list(self, collection_oid: str):
        articles_data = []
        count = float("inf")
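        # Paginated search: the total "count" is unknown before the first
        # response, so it starts at infinity; each request skips the records
        # already fetched and asks for at most 200 more.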
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {
                            "match": {
                                "_t": "ct:Item",
                                "category._rr": collection_oid,
                                "*oid": ["category._rr"],
                            }
                        }
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data

    def extract_issue_list(self, article_list: list[dict]):
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])
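
            # Try each issue regex in turn until one matches the citation.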
            issue_re = None
            for issue_regex in self.issue_regexes:
                issue_re = regex.search(issue_regex, issue_str)
                if issue_re:
                    break

            if not issue_re:  # coverage: condition never true during the recorded run
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed"
                )
            issue_dict = issue_re.groupdict()

            if issue_dict.get("pagestart", None):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend", None):
                article.lpage = issue_dict["pageend"]
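
            # This object URL is what gets downloaded later and fed to
            # parse_article_content.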
            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number", None),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
                )
                issues[pid] = issue
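
            # Article pids are positional within their issue: "a0", "a1", ...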
            article.pid = "a" + str(len(issues[pid].articles))
            issue.articles.append(article)
        return list(issues.values())

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        data = json.loads(content)
        xarticle.pid = pid
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
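        # Map the source's "other" to "und", the ISO 639 code for an
        # undetermined language.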
        if xarticle.lang == "other":
            xarticle.lang = "und"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(
                    lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
                )
            )
        # keywords
        if "subject" in data:  # coverage: condition always true during the recorded run
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:  # coverage: condition never true during the recorded run
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            print(
                f"[{self.source_domain}] {xarticle.pid} : Article URL not found, falling back to handle.net. Please check link validity : {url}"
            )
        xarticle.url = url
        if len(data["media"]) == 0:  # coverage: condition never true during the recorded run
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF Not found")
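        # Each media asset on the record becomes one PDF link.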
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle