Coverage for src/crawler/by_source/nsjom/nsjom_xml_crawler.py: 85%

126 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1import re 

2import typing 

3 

4from bs4 import BeautifulSoup, Tag 

5 

6# from ptf.model_data import create_publisherdata 

7from ptf.model_data import ( 

8 IssueData, 

9 create_articledata, 

10 create_contributor, 

11 create_extlink, 

12 create_issuedata, 

13 create_publisherdata, 

14 create_subj, 

15) 

16 

17from crawler.utils import add_pdf_link_to_xarticle 

18 

19if typing.TYPE_CHECKING: 19 ↛ 20line 19 didn't jump to line 20 because the condition on line 19 was never true

20 from ..nsjom_crawler import NsjomCrawler 

21 

22source_domain = "NSJOM" 

23 

24 

def parse_collection_content(
    self: "NsjomCrawler",
    _: str,
    periode_start: int = 0,
    periode_end: float = float("inf"),
    source_domain: str = "NSJOM",
    xissue_pid_to_parse: str | None = None,
):
    """
    Parses all articles from one xml file : https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml
    From 2015 to today

    Published records are grouped into issues keyed by (volume, issue number).
    Records whose year falls outside [periode_start, periode_end] are skipped;
    when xissue_pid_to_parse is given, only the issue with that pid is kept.
    Returns the list of IssueData built, each carrying its articles.
    """
    url = "https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"
    soup = BeautifulSoup(self.download_file(url), "lxml-xml")
    records_root = soup.select_one("records")
    if records_root is None:
        raise ValueError(f"[{source_domain}] Cannot parse source")

    issues: dict[tuple[str, str], IssueData] = {}
    for record in records_root.select("record"):
        publication_type = record.select_one("publicationType")
        if publication_type is None:
            raise ValueError(f"[{source_domain}] Cannot determine article publicationType")
        if publication_type.text != "published":
            continue

        year_element = record.select_one("year")
        if year_element is None or year_element.text == "":
            raise ValueError(f"[{source_domain}] Cannot parse year from article")
        year = int(year_element.text)
        if not (periode_start <= year <= periode_end):
            continue

        xarticle, volume, number = parse_article(self, record, source_domain=source_domain)
        key = (volume, number)
        if key not in issues:
            pid = f"{source_domain}_{year}__{volume}_{number}"
            # Skip this whole issue when a specific pid filter is active.
            if xissue_pid_to_parse and pid != xissue_pid_to_parse:
                continue
            xissue = create_issuedata()
            parse_issue_tag(xissue, record, year)
            xissue.year = year_element.text
            xissue.volume = volume
            xissue.number = number
            xissue.pid = pid
            issues[key] = xissue
        issues[key].articles.append(xarticle)

    return list(issues.values())

73 

74 

def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    """Parse a single issue by re-reading the collection XML restricted to
    the issue's year and pid.

    Raises ValueError when the issue has no year set.
    """
    if not xissue.year:
        raise ValueError("Issue year is not set")
    year = int(xissue.year)
    return parse_collection_content(self, content, year, year, source_domain, xissue.pid)

81 

82 

def parse_issue_tag(xissue: IssueData, article_tag: Tag, year: int) -> IssueData:
    """Fill issue-level metadata (publisher and a source ext link) from one
    <record> element, then return the mutated xissue."""
    publisher_element = article_tag.select_one("publisher")
    if publisher_element:
        publisher = create_publisherdata()
        publisher.name = publisher_element.text
        xissue.publisher = publisher

    # NOTE(review): uses the module-level source_domain constant, not the
    # per-call source_domain some callers pass around — confirm intended.
    xissue.ext_links.append(
        create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            metadata=source_domain,
        )
    )
    return xissue

97 

98 

def parse_article(self: "NsjomCrawler", article_tag: Tag, source_domain: str = "NSJOM"):
    """Parse one <record> element into an article.

    Returns a tuple (xarticle, volume, issue) where volume/issue are the raw
    texts of the record's <volume>/<issue> tags (used by the caller to group
    articles into issues).

    Raises ValueError when the DOI, volume or issue cannot be found.
    """
    xarticle = create_articledata()

    doi_tag = article_tag.select_one("doi")
    if doi_tag is None:
        raise ValueError(f"[{source_domain}] : Article doi not found")
    xarticle.doi = doi_tag.text
    # Build the pid by replacing every '/', '.' or '-' in the DOI with '_'.
    # (Bug fix: the previous pattern "\\/\\.-" only matched the literal
    # three-character sequence "/.-", leaving slashes and dots in the pid.)
    xarticle.pid = re.sub(r"[/.\-]", "_", doi_tag.text)

    page_start_tag = article_tag.select_one("startPage")
    page_end_tag = article_tag.select_one("endPage")
    if page_start_tag:
        xarticle.fpage = page_start_tag.text
    if page_end_tag:
        xarticle.lpage = page_end_tag.text

    date_published_tag = article_tag.select_one("publicationDate")
    if date_published_tag:
        xarticle.date_published_iso_8601_date_str = date_published_tag.text

    url_tag = article_tag.select_one("publisherRecordId")
    if url_tag:
        ext_link = create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/paper.html?noid={url_tag.text}",
            metadata=source_domain,
        )
        xarticle.ext_links.append(ext_link)

    title_tag = article_tag.select_one("title")
    if title_tag:
        xarticle.title_tex = title_tag.text

    # TODO : Affiliations ?

    authors_container = article_tag.select_one("authors")
    if authors_container:
        for author_tag in authors_container.select("author"):
            author = create_contributor(role="author")
            author_name_tag = author_tag.select_one("name")
            if author_name_tag:
                author["string_name"] = author_name_tag.text
            corresponding = author_tag.get("corresponding")
            if corresponding == "1":
                author["corresponding"] = True
                email_tag = author_tag.select_one("email")
                if email_tag:
                    author["email"] = email_tag.text
            xarticle.contributors.append(author)

    abstract_tag = article_tag.select_one("abstract")
    if abstract_tag:
        # Bug fix: attribute name was misspelled "langauge", so the declared
        # language was never read and "eng" was always forced.
        abstract_language = abstract_tag.get("language", None)
        if abstract_language is None or isinstance(abstract_language, list):
            abstract_language = "eng"
        xarticle.abstracts.append(
            {
                "tag": "abstract",
                "value_tex": abstract_tag.text,
                "lang": abstract_language or self.detect_language(abstract_tag.text) or "und",
            }
        )

    keywords_tag = article_tag.select_one("keywords")
    if keywords_tag:
        keywords_language = keywords_tag.get("language", "eng")
        if keywords_language is None or isinstance(keywords_language, list):
            keywords_language = "eng"
        for kwd_tag in keywords_tag.select("keyword"):
            subject = create_subj()
            subject["value"] = kwd_tag.text
            # Bug fix: keywords_language was computed but never used; the
            # language was hard-coded to "en".
            subject["lang"] = keywords_language
            xarticle.kwds.append(subject)

    msc_tag = article_tag.select_one("MSCs")
    if msc_tag:
        for msc_subj in msc_tag.select("MSC"):
            subject = create_subj()
            subject["value"] = msc_subj.text
            subject["type"] = "msc"
            # MSC codes are language-neutral; kept as "en" for compatibility.
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    pdf_location_tag = article_tag.select_one("filelocation")
    pdf_name_tag = article_tag.select_one("file")
    if pdf_location_tag and pdf_name_tag:
        pdf_url = "https://sites.dmi.uns.ac.rs/nsjom/" + pdf_location_tag.text + pdf_name_tag.text
        add_pdf_link_to_xarticle(xarticle, pdf_url)

    volume_tag = article_tag.select_one("volume")
    issue_tag = article_tag.select_one("issue")
    if volume_tag is None or issue_tag is None:
        raise ValueError(
            f"[{source_domain}] {xarticle.doi} Cannot parse volume or issue from article"
        )

    # Citations ?

    return xarticle, volume_tag.text, issue_tag.text