Coverage for src/crawler/by_source/nsjom/nsjom_xml_crawler.py: 86%

117 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

1import re 

2import typing 

3 

4from bs4 import BeautifulSoup, Tag 

5 

6# from ptf.model_data import create_publisherdata 

7from ptf.model_data import ( 

8 IssueData, 

9 create_articledata, 

10 create_contributor, 

11 create_extlink, 

12 create_issuedata, 

13 create_subj, 

14) 

15 

16from crawler.base_crawler import add_pdf_link_to_xarticle 

17 

18if typing.TYPE_CHECKING: 18 ↛ 19line 18 didn't jump to line 19 because the condition on line 18 was never true

19 from ..nsjom_crawler import NsjomCrawler 

20 

21source_domain = "NSJOM" 

22 

23 

def parse_collection_content(
    self: "NsjomCrawler",
    _: str,
    periode_start: int = 0,
    periode_end: float = float("inf"),
    source_domain: str = "NSJOM",
    xissue_pid_to_parse: str | None = None,
):
    """
    Parses all articles from one xml file : https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml
    From 2015 to today

    Downloads the collection-wide XML, keeps only records whose
    publicationType is "published" and whose year lies in
    [periode_start, periode_end], groups articles into issues keyed by
    (volume, issue), and returns the issues as a list.

    :param self: crawler instance; used only for self.get_page_content
    :param _: unused content argument (kept for interface compatibility)
    :param periode_start: skip records published before this year
    :param periode_end: skip records published after this year
    :param source_domain: collection identifier used in PIDs and error messages
    :param xissue_pid_to_parse: if set, only the issue with this PID is built
    :raises ValueError: when the XML lacks <records>, a record has no
        publicationType, or a record has no usable <year>
    """
    xissues: dict[tuple[str, str], IssueData] = {}
    url = "https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"
    content = self.get_page_content(url)
    soup = BeautifulSoup(content, "lxml-xml")
    record_container_element = soup.select_one("records")
    if record_container_element is None:
        raise ValueError(f"[{source_domain}] Cannot parse source")
    for record_element in record_container_element.select("record"):
        # Only records explicitly flagged "published" are crawled.
        publication_type_tag = record_element.select_one("publicationType")
        if publication_type_tag is None:
            raise ValueError(f"[{source_domain}] Cannot determine article publicationType")
        if publication_type_tag.text != "published":
            continue
        year_tag = record_element.select_one("year")
        if year_tag is None or year_tag.text == "":
            raise ValueError(f"[{source_domain}] Cannot parse year from article")
        year = int(year_tag.text)
        if periode_start > year or year > periode_end:
            continue
        xarticle, volume_number, issue_number = parse_article(
            record_element, source_domain=source_domain
        )
        if (volume_number, issue_number) not in xissues:
            pid = f"{source_domain}_{year}__{volume_number}_{issue_number}"
            # When a PID filter is set, records of other issues never create a
            # dict entry, so they re-enter this branch and are skipped again.
            if xissue_pid_to_parse and xissue_pid_to_parse != pid:
                continue
            # Issue-level metadata comes from the first record seen for
            # this (volume, issue) pair.
            xissue = create_issuedata()
            parse_issue_tag(xissue, record_element, year)
            xissue.year = year_tag.text
            xissue.volume = volume_number
            xissue.number = issue_number
            xissue.pid = pid
            xissues[(volume_number, issue_number)] = xissue
        xissues[(volume_number, issue_number)].articles.append(xarticle)

    return list(xissues.values())

72 

73 

def parse_issue_content(self, content: str, xissue: IssueData):
    """Re-crawl a single issue.

    Delegates to parse_collection_content, restricting the crawl to the
    issue's own year and PID.
    """
    issue_year = int(xissue.year)
    return parse_collection_content(
        self,
        content,
        periode_start=issue_year,
        periode_end=issue_year,
        source_domain=source_domain,
        xissue_pid_to_parse=xissue.pid,
    )

78 

79 

def parse_issue_tag(xissue: IssueData, article_tag: Tag, year: int) -> IssueData:
    """Attach issue-level metadata to *xissue* and return it.

    Currently this only records a "source" external link pointing at the
    NSJOM issue page for *year*; publisher data is not parsed from the
    record (see the commented-out create_publisherdata import at the top
    of the module).
    """
    issue_url = f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}"
    xissue.ext_links.append(
        create_extlink(rel="source", location=issue_url, metadata=source_domain)
    )
    return xissue

94 

95 

def parse_article(article_tag: Tag, source_domain: str = "NSJOM"):
    """Parse one <record> XML element into an article.

    :param article_tag: the <record> element describing one article
    :param source_domain: collection identifier used in error messages and
        ext-link metadata
    :returns: tuple (xarticle, volume number, issue number) — volume/issue
        are returned so the caller can group articles into issues
    :raises ValueError: when the doi, volume or issue cannot be found
    """
    xarticle = create_articledata()

    doi_tag = article_tag.select_one("doi")
    if doi_tag is None:
        raise ValueError(f"[{source_domain}] : Article doi not found")
    xarticle.doi = doi_tag.text
    # Build the PID by replacing each '/', '.' or '-' of the DOI with '_'.
    # Fix: the previous pattern "\\/\\.-" only matched the literal three-char
    # sequence "/.-", so DOI separators leaked into the PID unchanged.
    xarticle.pid = re.sub(r"[/.-]", "_", doi_tag.text)

    page_start_tag = article_tag.select_one("startPage")
    page_end_tag = article_tag.select_one("endPage")
    if page_start_tag and page_end_tag:
        xarticle.page_range = page_start_tag.text + " - " + page_end_tag.text

    date_published_tag = article_tag.select_one("publicationDate")
    if date_published_tag:
        xarticle.date_published_iso_8601_date_str = date_published_tag.text

    url_tag = article_tag.select_one("publisherRecordId")
    if url_tag:
        ext_link = create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/paper.html?noid={url_tag.text}",
            metadata=source_domain,
        )
        xarticle.ext_links.append(ext_link)

    title_tag = article_tag.select_one("title")
    if title_tag:
        xarticle.title_tex = title_tag.text

    # TODO : Affiliations ?

    authors_container = article_tag.select_one("authors")
    if authors_container:
        for author_tag in authors_container.select("author"):
            author = create_contributor(role="author")
            author_name_tag = author_tag.select_one("name")
            if author_name_tag:
                author["string_name"] = author_name_tag.text
            corresponding = author_tag.get("corresponding")
            if corresponding == "1":
                author["corresponding"] = True
            email_tag = author_tag.select_one("email")
            if email_tag:
                author["email"] = email_tag.text
            xarticle.contributors.append(author)

    abstract_tag = article_tag.select_one("abstract")
    if abstract_tag:
        # Fix: the attribute name was misspelled "langauge", so the declared
        # language was never read and every abstract fell back to "eng".
        abstract_language = abstract_tag.get("language", "eng")
        if abstract_language is None or isinstance(abstract_language, list):
            abstract_language = "eng"
        xarticle.abstracts.append(
            {"tag": "abstract", "value_tex": abstract_tag.text, "lang": abstract_language}
        )

    keywords_tag = article_tag.select_one("keywords")
    if keywords_tag:
        keywords_language = keywords_tag.get("language", "eng")
        if keywords_language is None or isinstance(keywords_language, list):
            keywords_language = "eng"
        for kwd_tag in keywords_tag.select("keyword"):
            subject = create_subj()
            subject["value"] = kwd_tag.text
            # Fix: use the parsed keywords language; it was previously parsed
            # but dead, with "en" hard-coded instead.
            subject["lang"] = keywords_language
            xarticle.kwds.append(subject)

    msc_tag = article_tag.select_one("MSCs")
    if msc_tag:
        for msc_subj in msc_tag.select("MSC"):
            # MSC codes are language-neutral classification codes; keep "en".
            subject = create_subj()
            subject["value"] = msc_subj.text
            subject["type"] = "msc"
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    pdf_location_tag = article_tag.select_one("filelocation")
    pdf_name_tag = article_tag.select_one("file")
    if pdf_location_tag and pdf_name_tag:
        pdf_url = "https://sites.dmi.uns.ac.rs/nsjom/" + pdf_location_tag.text + pdf_name_tag.text
        add_pdf_link_to_xarticle(xarticle, pdf_url)

    volume_tag = article_tag.select_one("volume")
    issue_tag = article_tag.select_one("issue")
    if volume_tag is None or issue_tag is None:
        raise ValueError(
            f"[{source_domain}] {xarticle.doi} Cannot parse volume or issue from article"
        )

    # Citations ?

    return xarticle, volume_tag.text, issue_tag.text