Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 83% (98 statements), coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

if typing.TYPE_CHECKING:
    from .nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages
    (e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html),
    from 1971 to 2009 (inclusive).
    """
    xissues: list[IssueData] = []
    year_start = 1971
    year_end = min(2009, datetime.now().year)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Attach the source and year to the error before re-raising
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues


def parse_issue_content(self, content: str, xissue: IssueData):
    if not xissue.year:
        raise ValueError("Issue year is not set")
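    # Re-parse the whole year page; passing xissue.pid restricts article
    # parsing to this one issue (see parse_issue_tag).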
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one year page,
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """

    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
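    # Each top-level <table> on the year page corresponds to one issue.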
    issues_tags = soup.select("body>table")
    for issue_tag in issues_tags:
        xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
        if not xissue:
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues


def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag,
    e.g. `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
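    # Record the issue page this metadata was scraped from as a "source" external link.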
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
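    # The header looks like "NSJOM Vol 39. No. 2"; the issue number is optional
    # (e.g. "NSJOM Vol 5." alone), in which case the Issue group is None.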
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
    # Issue summary?
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"
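    # Note: when the header has no issue number, issue_number is None and the
    # pid embeds the literal string "None" (i.e. "NSJOM_<year>_<volume>_None").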
    if pid_to_parse and xissue.pid != pid_to_parse:
        return
    table_lines.pop(0)  # table header
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue


def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag,
    e.g. `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data

    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
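    # The author cell lists one name per <br/>-separated line; splitting the raw
    # HTML on "<br/>" and re-parsing each fragment strips any remaining markup.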
    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

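    # The <a> tag in the title cell carries the PDF link; articles without one are skipped.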
    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        self.logger.warning(
            f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link",
            extra={"pid": xarticle.pid},
        )
        return None
    pdf_link = title_link_tag.get("href")
    if pdf_link is None:
        raise ValueError("Article pdf link is None")
    if isinstance(pdf_link, list):
        raise ValueError("Article has multiple pdf hrefs")
    pdf_link = self.source_website + pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)

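    # pages_tag.text is assumed to hold a page range such as "1-12";
    # set_pages parses it onto the article.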
    set_pages(xarticle, cleanup_str(pages_tag.text))
    return xarticle
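

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the module under coverage).
# A minimal, hedged example of how parse_collection_content might be driven;
# the NsjomCrawler constructor call below is hypothetical, and the real class
# in .nsjom_crawler may take different arguments.
#
# from crawler.by_source.nsjom.nsjom_crawler import NsjomCrawler
#
# crawler = NsjomCrawler()  # hypothetical constructor call
# xissues = parse_collection_content(crawler, content="")  # content is unused here
# for xissue in xissues:
#     print(xissue.pid, xissue.volume, len(xissue.articles))
# ---------------------------------------------------------------------------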