Coverage for src/crawler/by_source/dmlbul_crawler.py: 13%

82 statements  

coverage.py v7.12.0, created at 2025-12-23 15:27 +0000

import json

import regex
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as article (2 pdfs)
# Handle this by creating another article as an erratum, cf. https://www.numdam.org/item/SEDP_1971-1972____A1_0/
class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"
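    # All requests below target this host: the JSON search API under
    # /api/object/ and the PDF asset store under /asset/default/.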

    # Regexes to detect issue metadata, from most specific to least
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]
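    # Illustrative (made-up) citation strings each pattern is meant to match:
    #   "Mathematica Balkanica, Vol. 9, No 1, (1995), 12p-34p"  -> MEM pattern
    #   "(2007). Title, 33(2), 123-145."                        -> second pattern
    #   "Volume 12, Number 3-4, 2010, pp. 5-17"                 -> SMJ2 pattern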

    def parse_collection_content(self, content):
        """
        Parse the collection's JSON description and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        data = json.loads(content)
        id = data["_id"]
        articles = self.query_article_list(id)
        issues = self.extract_issue_list(articles)
        return issues
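
    # query_article_list pages through the search API: repeated POSTs fetch up
    # to 200 records each until the server-reported "count" has been collected.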

    def query_article_list(self, id: str):
        articles_data = []
        count = float("inf")
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": id, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data
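
    # extract_issue_list groups articles into issues keyed by the PID derived
    # from (collection, year, volume, number); within an issue, article PIDs
    # are assigned as "a0", "a1", ... in encounter order.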

    def extract_issue_list(self, article_list: list[dict]):
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # Try each issue regex in turn until one matches the citation string
            issue_re = None
            re_index = 0
            MAX_RETRIES = len(self.issue_regexes)
            while issue_re is None and re_index < MAX_RETRIES:
                issue_re = regex.search(self.issue_regexes[re_index], issue_str)
                re_index += 1

            if not issue_re:
                self.logger.error(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed",
                    extra={"input": _a},
                )
                continue
            issue_dict = issue_re.groupdict()

            if issue_dict.get("pagestart"):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend"):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number"),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number")
                )
                issues[pid] = issue

            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())
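
    # parse_article_content fills in title, language, authors, abstract and
    # keywords from the article's JSON record, then resolves the article URL
    # and its PDF asset link(s).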

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
    ):
        data = json.loads(content)
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(lang=xarticle.lang, value_tex=data["abstract"]["_v"])
            )
        # keywords
        if "subject" in data:
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            self.logger.info(
                f"Article URL not found, falling back to handle.net. Please check link validity: {url}",
                extra={"pid": xarticle.pid},
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle
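
# Minimal usage sketch (hypothetical: constructor arguments and the crawl entry
# point depend on BaseCollectionCrawler, which is not shown in this file):
#
#   crawler = DmlbulCrawler(collection_id="MEM", collection_url="...")
#   xissues = crawler.parse_collection_content(collection_json)
#   for xissue in xissues:
#       for xarticle in xissue.articles:
#           ...  # fetch xarticle.url, then call parse_article_content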