Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py

1import re

2import typing

3from datetime import datetime

5from bs4 import BeautifulSoup, Tag

6from ptf.model_data import (

7 IssueData,

8 create_articledata,

9 create_contributor,

10 create_extlink,

11 create_issuedata,

12)

14from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

16if typing.TYPE_CHECKING: 16 ↛ 17line 16 didn't jump to line 17 because the condition on line 16 was never true

17 from .nsjom_crawler import NsjomCrawler

# Source identifier: used as ext-link metadata and as the prefix of issue pids.
source_domain = "NSJOM"

def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """
    Collect the issues listed on the year-specific webpages, e.g.
    https://sites.dmi.uns.ac.rs/nsjom/ns2010.html

    Covers 2010 to 2014 (included). Each year page is fetched through
    ``self.download_file`` and handed to ``parse_year``; the per-year results
    are concatenated in chronological order.

    ``content`` and ``source_domain`` are accepted but not read by this
    function.
    """
    first_year = 2010
    last_year = min(2014, datetime.now().year)

    collected: list[IssueData] = []
    for year in range(first_year, last_year + 1):
        year_url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_page = self.download_file(year_url)
        collected.extend(parse_year(self, year_page, year, year_url))

    return collected

def is_heading(element: Tag):
    """
    Tell whether ``element`` is an issue heading.

    True when ``select_one(".HeadingNSJOM")`` finds a match under the element,
    or when the element's own ``class`` attribute is / contains
    ``HeadingNSJOM``. False otherwise (including when it has no class).
    """
    if element.select_one(".HeadingNSJOM"):
        return True

    css_classes = element.get("class")
    if not css_classes:
        return False

    if isinstance(css_classes, str):
        # A plain string must match exactly; no substring search.
        return css_classes == "HeadingNSJOM"

    return "HeadingNSJOM" in css_classes

def parse_issue_content(self, content: str, xissue: IssueData):
    """
    Run ``parse_year`` on ``content`` for the issue described by ``xissue``.

    The issue's year, url and pid are forwarded to ``parse_year``.

    Raises:
        ValueError: if ``xissue.year`` is not set.
    """
    issue_year = xissue.year
    if not issue_year:
        raise ValueError("Issue year is not set")

    # NOTE(review): the list returned by parse_year is discarded and ``xissue``
    # itself is not modified here - confirm this is what callers expect.
    parse_year(self, content, int(issue_year), xissue.url, xissue.pid)

def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_crawl: str | None = None,
):
    """
    Parse one year page and return the issues it lists.

    Args:
        content: HTML of the year page.
        year: publication year, forwarded to ``parse_issue_tags``.
        url: value stored in ``xissue.url`` for every issue returned.
        pid_to_crawl: forwarded to ``parse_issue_tags``; issues for which it
            returns a falsy value are left out of the result.

    Returns:
        A list of ``IssueData``, in page order.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Headings, ``p.style1`` paragraphs, and the links inside the blockquote
    # that directly follows a ``p.style1`` paragraph, in document order.
    page_elements = soup.select("p.HeadingNSJOM, p.style1, p.style1+blockquote a")

    # Sort tags into issues: every heading opens a new group and the tags that
    # follow it belong to that group until the next heading.
    issues: list[list[Tag]] = []
    for current_element in page_elements:
        if is_heading(current_element):
            issues.append([current_element])
            continue
        if current_element.text == "\xa0":
            # Paragraph holding only a non-breaking space: spacer, not data.
            continue
        if not issues:
            # No heading seen yet, so there is no issue to attach this tag to.
            # Skip it instead of failing with an IndexError on ``issues[-1]``.
            self.logger.debug("Skipping element found before the first issue heading")
            continue
        issues[-1].append(current_element)

    xissues: list[IssueData] = []
    for issue_elements in issues:
        xissue = parse_issue_tags(self, issue_elements, year, pid_to_crawl)
        if not xissue:
            continue
        xissue.url = url
        xissues.append(xissue)

    return xissues

def parse_issue_tags(
    self: "NsjomCrawler", tags: list[Tag], year: int, pid_to_crawl: str | None = None
):
    """
    Build an issue from its heading tag and the article tags that follow it.

    Args:
        tags: the heading tag first, then the article tags. After the heading,
            tags are consumed in (metadata, title) pairs. The list is not
            modified.
        year: publication year; stored on the issue and used in its pid and
            source link.
        pid_to_crawl: when given, only the issue with this pid is parsed.

    Returns:
        The populated issue, or ``None`` when ``pid_to_crawl`` is given and
        does not match this issue's pid.

    Raises:
        IndexError: if ``tags`` is empty.
        ValueError: if the heading text does not contain ``NSJOM Vol. <n>``.
    """
    source_url = f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}"

    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=source_url,
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    issue_title_tag = tags[0]
    match = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No. (?P<Issue>\d+))?", issue_title_tag.text
    )
    if match is None:
        raise ValueError("Cannot find volume number")

    # ``Volume`` is a mandatory ``\d+`` group, so it is non-empty on any match.
    volume_number = match.group("Volume")
    xissue.volume = volume_number

    # ``Issue`` is optional in the pattern: it is None when the heading has no
    # ", No. <n>" part. Leave it out of the pid rather than embedding "None".
    issue_number = match.group("Issue")
    pid_parts = [source_domain, str(year), volume_number]
    if issue_number:
        xissue.number = issue_number
        pid_parts.append(issue_number)
    xissue.pid = "_".join(pid_parts)

    if pid_to_crawl and xissue.pid != pid_to_crawl:
        return None

    # Iterate over a copy of the article tags so the caller's list is left
    # untouched; the title tag of each pair is pulled from the same iterator.
    remaining = iter(tags[1:])
    for article_meta in remaining:
        article_meta_text = cleanup_str(article_meta.text)
        if article_meta_text == "":
            continue

        article_title = next(remaining, None)
        if article_title is None:
            # Metadata with no title tag after it: nothing left to pair with.
            self.logger.debug(
                "Article metadata without a title tag", extra={"pid": xissue.pid}
            )
            break

        article = parse_article(
            self,
            article_meta_text,
            article_title,
            xissue.pid,
            len(xissue.articles),
            source_url,
        )
        if article is None:
            continue
        xissue.articles.append(article)
    return xissue

145

146

def parse_article(
    self: "NsjomCrawler",
    meta_text: str,
    title_tag: Tag,
    issue_pid: str,
    index: int,
    source: str | None = None,
):
    """
    Build one article from its metadata line and its title tag.

    Args:
        meta_text: text matching ``<n> / <n> / <n> / <pages>: <authors>``,
            where ``<pages>`` is a number or a ``start-end`` range.
        title_tag: tag whose ``href`` gives the PDF location (appended to
            ``self.source_website``) and whose text is the article title.
        issue_pid: pid of the enclosing issue, used as prefix of the article pid.
        index: position of the article, used as suffix of the article pid.
        source: when truthy, recorded as a "source" ext link on the article.

    Returns:
        The article data, or ``None`` when ``title_tag`` has no ``href``.

    Raises:
        ValueError: if ``meta_text`` does not match the expected layout, or if
            the ``href`` attribute comes back as a list.
    """
    xarticle = create_articledata()
    xarticle.pid = f"{issue_pid}a_{index}"

    if source:
        xarticle.ext_links.append(
            create_extlink(rel="source", location=source, metadata=self.source_domain)
        )

    meta_match = re.search(
        r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta_text
    )
    if meta_match is None:
        raise ValueError("Cannot parse authors or page number")
    self.set_pages(xarticle, meta_match.group("Pages"))

    # One captured chunk per author; presumably splits the list on commas and
    # on " and " - verify against real metadata lines.
    author_names = re.findall(
        r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", meta_match.group("Authors")
    )
    for author_name in author_names:
        xarticle.contributors.append(
            create_contributor(role="author", string_name=author_name)
        )

    href = title_tag.get("href")
    if href is None:
        self.logger.debug("Article does not have a pdf", extra={"pid": xarticle.pid})
        return None
    if isinstance(href, list):
        raise ValueError("Article link is a list")
    add_pdf_link_to_xarticle(xarticle, self.source_website + href)

    xarticle.title_tex = cleanup_str(title_tag.text)

    return xarticle

Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py: 79%

109 statements