Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 81%
99 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

if typing.TYPE_CHECKING:  # coverage: condition never true at runtime
    from ..nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    periode_start: int = 0,
    periode_end: int = datetime.now().year,
    source_domain: str = "NSJOM",
):
30 """
31 Parses all articles from year-specific webpages : https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
32 From 1971 to 2009 (included)
33 """
    xissues: list[IssueData] = []
    year_start = max(1971, periode_start)
    year_end = min(2009, periode_end)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Add context to the error before re-raising
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues


def parse_issue_content(self, content: str, xissue: IssueData):
    if not xissue.year:  # coverage: condition never true
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one year page,
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2009.html"""
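    # Each <table> directly under <body> on these pages describes one issue.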
    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    issues_tags = soup.select("body>table")
    for issue_tag in issues_tags:
        xissue = parse_issue_tag(self, issue_tag, year)
        if not xissue:  # coverage: condition never true
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues


def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag,
    e.g. `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
    # Issue Summary ?
    if match is None:  # coverage: condition never true
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:  # coverage: condition always true
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_parse and xissue.pid != pid_to_parse:  # coverage: condition never true
        return
    table_lines.pop(0)  # table header
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue


def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag,
    e.g. `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:  # coverage: condition always true
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:  # coverage: condition never true
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data
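
    # Author names are separated by <br/> tags inside the first cell, so split the
    # raw HTML on <br/> and strip any remaining markup from each chunk.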
    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        print(f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link")
        return None
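    # BeautifulSoup's Tag.get() can return None (attribute missing) or a list
    # (multi-valued attribute), hence the two checks below before using the href.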
    pdf_link = title_link_tag.get("href")
    if pdf_link is None:  # coverage: condition never true
        raise ValueError("Article pdf link is None")
    if isinstance(pdf_link, list):  # coverage: condition never true
        raise ValueError("Article has multiple pdf hrefs")
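    # The href is treated as relative to the crawler's base website; prepend it to build the full PDF URL.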
    pdf_link = self.source_website + pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)

    self.set_pages(xarticle, cleanup_str(pages_tag.text))
    return xarticle