From f3270ca6ae5d30e4b5766713bf0f3578e13c1353 Mon Sep 17 00:00:00 2001 From: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:25:04 -0500 Subject: [PATCH] Resolve NCCM checksum error and add years args (#1870) * add new download links, years Args, and new test data * remove download test file * include all years by default * sort year and verify --- tests/data/nccm/13090442.zip | Bin 3043 -> 0 bytes tests/data/nccm/13090442/CDL2017_clip.tif | Bin 853 -> 0 bytes tests/data/nccm/13090442/CDL2018_clip1.tif | Bin 854 -> 0 bytes tests/data/nccm/13090442/CDL2019_clip.tif | Bin 857 -> 0 bytes tests/data/nccm/CDL2017_clip.tif | Bin 0 -> 977 bytes tests/data/nccm/CDL2018_clip1.tif | Bin 0 -> 973 bytes tests/data/nccm/CDL2019_clip.tif | Bin 0 -> 967 bytes tests/data/nccm/data.py | 17 ++----- tests/datasets/test_nccm.py | 21 +++++--- torchgeo/datasets/nccm.py | 56 ++++++++++++--------- 10 files changed, 50 insertions(+), 44 deletions(-) delete mode 100644 tests/data/nccm/13090442.zip delete mode 100644 tests/data/nccm/13090442/CDL2017_clip.tif delete mode 100644 tests/data/nccm/13090442/CDL2018_clip1.tif delete mode 100644 tests/data/nccm/13090442/CDL2019_clip.tif create mode 100644 tests/data/nccm/CDL2017_clip.tif create mode 100644 tests/data/nccm/CDL2018_clip1.tif create mode 100644 tests/data/nccm/CDL2019_clip.tif diff --git a/tests/data/nccm/13090442.zip b/tests/data/nccm/13090442.zip deleted file mode 100644 index 19d0792078a0042fb635f5a7c30c29df453f5458..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3043 zcmd6pX;jkb9>;OLQ#ejzTAJCSSFK#YeJLU}&D=EuMG;VtL_q@VgyvAg)v3vv#w~Zt zEVE20H4P~%7s4e=(bB}Nj1&|V!(Fe{+|#|CJ8!05%>Oy(|2)r|@Ao_}KELnR$6Y~D z9q{3!V36>Shd+Lh)%`FPN1=kKM<_TtCK5v;VMw6;IBFymDi>LggmskA<`7Zu=6nd*}}dDJR9} z#yrqA8&0VJ~co43tp0`$0z$By|NKRFlQl(Sj8$6=}$optK6UKKV=oUD6+uG;LVySYxl)G+oS zXl~NfoXTGrz+lVp8H|043X)nz>-W;F-;u$*v({VbtmQ-xKpn~@aaKu9E|~2GQ8{b4 z2hV=Xr1Bp3;uY>N<5i{Myna^Rn>6>~k+pE(1WQAzk@1s1>J3-Ds-`i!0EG;w zOkn;VN1b5$9L%Hh9CZe>61r~_8>Tw)_hwfc9`FoUoUX%XOJ2H~40GPG$HVG%N>dVq zf>a~?OAl+6-86hcC$D&6uJbL@HPlrWJ(~JRM`(^=lyg04EH!i)#1My&p+BVb)NdckS|?hmL&qZ4X_4{eT!YGCw3B!q4DI$D68x0UiN))vF1fzCEgop9hc$75X zhkA8sT{lOG-YZ!ahYTg{l^0!*jYRnLdyqB~kzPcMslz%-^F-D;7sIS~&OPnQh_wu( z6hCofTI?h?(;2lQB-iXfR4wIFdzpT7<(*!3m1lJ)3)9imjp_ouNsXO8D&_8gHJaoK zl!->nJEJ{!-{&^Xm*m=XXFB0JZd_+HE!U8u{9ew^!ScQOZsl}G!cQc3m(ER92^9K) zfo&T^lL$KE$UI?ZnL}Y z5tf+*4ejC}LuNqZ*N-kWHw;X^M9%?Luhp8Uc+YNq;%~LqL+U+e{QG0!^`3WAV#fx* z?X8#eFM}M-#}&J4yvZH&$1Pk+eIIlPulm%#j;2x#j_cc!$9J~V#-OQ`m=_^7xXX6~ z;;F#23bgE}i}(Z!x581X*81f7*!^Wnlr7Z-_?GR5rlHX{;*w|E7&xGkp}>8Xelt9u zcyM7%!?CC=I2(1_ij0gXNxBoeGA(NUbq`@Yep<3Lv7_Y*jF(1#JBd%HQ^tAmH?L?w z+thBy#>TEo|(}FhjhSWJAOIrf3tJ6*~8FUc6j$%<2ak0!Yj>ZOooPJhU|__F?+GO{NA{b zBZ5H%9S2liF(+nM&F#?6qKyzyR~x(hgogJ_@h2G zXiz|`T#jjT=p!1&g~|w8V`#RxddTal*m>XrE`b~XFtO&t?cX(0M@;|# diff --git a/tests/data/nccm/13090442/CDL2017_clip.tif b/tests/data/nccm/13090442/CDL2017_clip.tif deleted file mode 100644 index 8dce2bb82e94194da84eb4a0343cc364a4385521..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 853 zcmebD)MDUZU|-qGR53%@tUxv!lnv4&$jAcLTLt8ZB8iJZ z*$O~4;!riWfof!s)YwAVOh7fkNNm1l9tLKh*exJ#YT;pE1Ja*>czruF*o{g+b2hZ| zFo5(W0ofbdnZQ0N0KJ+XbCVt7#W@kFmh~c2g)+Af%$#PoE#h4LGs*Snt`Dsw`7{^k|L0K z4$DF!9K+p%6)a3@7#bKD@}K|UVPvmon14Y01Mk0o%?OWj0K;EK zhWG^%0bIIDJq(K{{4mL2SLk3++Hi8pDq9T(r{#tgTTVIhbRJgyu)?J9H~X9u3ol0Y zeDaiOJ@jzGO67mLtQ}rU4oE(Y+itVkOR4|I<$v3?C;BCtp5&EF<>?An3-GWN^A(tW zRA1!B6t}y^o1$M`Sha#2v1ah3DOrD`(hDHViv-XNA?tsr&iW=3PjgHGAqA_DLshJriDGQZFvF z)ZO5lg{V|&T1(Ut5Tdgd#ME!-1#0yh@=YoX`S4|Olz1DM8(pIgE?7FFkPyF)u zweICBnLVZ-wDT?<^pN^BPh~BW&+kL0lhnC&JxUCYxV-YVXIRH=r1ox4kcigGHS2ai zTe;RiM{I&MUzF5pZEd4j`?g%{sk^l*;MKGB3;C?|dQE-1Wx6(TNVEOcTAggS$Loc& y)bHh|FO@B{yi}afsqE8yZ1K9+Imc$1PA++C;>CT%YOa4*EW=X6i6I5e3lsoz2?TQh diff --git a/tests/data/nccm/13090442/CDL2018_clip1.tif b/tests/data/nccm/13090442/CDL2018_clip1.tif deleted file mode 100644 index 531cd5f4f1f5aa0046c0c81104b3da537f2d14cb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 854 zcmebD)MDUZU|-qGR53%@tUxv!lnv4&$jAcLTLt8ZB8iJZ z*$O~4;!rhrfNErr)YwAVOh7fkNNm1l9tLKh*exJ#YT;pE1Ja*>czruF*o{g+b2hZ| zFo5(W0ofbdnZQ0N0KJ+XbCVt7#W@kFmh~c2g)+Af%$#PoE#h4LGs*Snt`Dsw`7{^k|L0K z4$DF!9K+p%6)a3@7#bKD^1pvzU}me|{~-T^@E0b=`h$-YI9&V$*%+LY7zBiWurM?+ zJiKuHj}$jM!@*>Oj6Y)5yj@=RFYc&u{maGR@z`Mb3l65gEeuciAIMZS3QY4)`7Dvb z=cwA~^6WyzQX$QtC7~}uGVTP~yH9Y_`gk)&u2H_z=cQ1@<8O0wCU_+{aPBG6bnQQs za&^~UH)Wx=SbcK~m9${?ewSy3GfJfDa~Tto#Ck-p1#%v^qG!w@Q}jP?s(X_7&wphL z1t;B0_<5^lpKVF|?G-|&uUwm7u{CgAX{?O0djIt6%ZfZ|?tS#{{qQKvqgKh4dFJtF zmU(jS$$hJ%6+h1sv3h!YS9!h^XP#elQs4c@mu93?Ust{UN^w?XNzh%EzPz+umPf1- zIy6=~?>VqS~NPm z|2RXYkSdp9ZPqsxgc;^Ouv zy3-nUK4hKL+xZ+Nw>_z>oLoXCxZ~y?E`V0U7 diff --git a/tests/data/nccm/13090442/CDL2019_clip.tif b/tests/data/nccm/13090442/CDL2019_clip.tif deleted file mode 100644 index 67be3087ed77aad718997c0c2d597c301dcf481b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 857 zcmebD)MDUZU|-qGR53%@tUxv!lnv4&$jAcLTLt8ZB8iJZ z*$O~4;!rjBfof!s)YwAVOh7fkNNm1l9tLKh*exJ#YT;pE1Ja*>czruF*o{g+b2hZ| zFo5(W0ofbdnZQ0N0KJ+XbCVt7#W@kFmh~c2g)+Af%$#PoE#h4LGs*Snt`Dsw`7{^k|L0K z4$DF!9K+p%6)a3@7#bKD^gpn0F#K<9xc^}T1Me@U`iA)%0;GR1G1NOOQVW=hSJItdqhPPCMZu=kKuK+_R0{bD5S-ow`3|o90_KJ%+O`Iz}^E zvy3%bT;4WF`=sT@?LYgVkmniOeDU_+Vmb*9|B_)$pW$De1PPjtT0u)I>|$~sQ(6~|Tu-+LOwulUSEKQMOA zs;mPW-redsr)t1`rpa=ZQNSa^mDa23ZBPH2GgBw;(zTaHUB+!!^z|ZFR$B zLi3*MHk_AZHYxee)|h%UV{)m;LW46YZOeIozw2GcF-L5_a-h<$CCjz4+`6|-xYe*> z)5_Uj)-U9gdh~Jg$0%0DEuGy4Z7YMa>e{yToyz*Goj2K;LuJ#dsDH}-yKZb!*!cNY zpHAMboY`wve3IF&`SQ2j1K$f-&lc-qGR53%@tUxv!lnv4&$jAcLTLt8ZB8iJZ z*$O~4;!ri=ObiS%NNQ}MY@jehFcO=unTLTHD0T~on_74n*nspWAYR|j40fXu(5?;b zJPaUxNkI0-b|$b-ihyhoQ0$R=QQISK52GQXxXH*P^86p}#W;NJIDeVth?fIQB_r5p z>1JpFPQXY>zSVyKVS`NXYDw3!2ea~DqgR} z_6{t>H6qrcu%n?|DtPh2^Uzi z1(ll}Gqx}P5)>G=&}rQj4>c~qDUIjif@IbD>Qfod#U(IgpR(?_aOKNJ?kh>|0c-NU zMoHhW>g8G(m9SocW&7rg329F!Sk#48-)st1nh=m;rL4H^)1zn!6?;vcOQ-Y;wYO=D zbQ|($>1{URTr}yL>8YZqoeTN2qNXj};J2{uJvQ`$T=ix+dapXKgaxYom}Sd;N& z)US=}dV^dI?N4aQ9NF_y@Fin|Lh-pX*6XFNEYSbBCt$api1Cum7Cwh>ecEwi^II?L zo-CE`YtDx3XtOAK-Shv#s!P!Zww)yle(v74UG!D+3vC$-KSr5GM?D=sqJWYP8dV*mS1aH^jBmH W>K$Dr5-`!?BvZjl&5J!w4GjQ!s5wCZ literal 0 HcmV?d00001 diff --git a/tests/data/nccm/CDL2018_clip1.tif b/tests/data/nccm/CDL2018_clip1.tif new file mode 100644 index 0000000000000000000000000000000000000000..3313fef10d1de79f2e218ceb3e9fbd9855201f43 GIT binary patch literal 973 zcmebD)MDUZU|-qGR53%@tUxv!lnv4&$jAcLTLt8ZB8iJZ z*$O~4;!rigObiS%NNQ}MY@jehFcO=unTLTHD0T~on_74n*nspWAYR|j40fXu(5?;b zJPaUxNkI0-b|$b-ihyhoQ0$R=QQISK52GQXxXH*P^86p}#W;NJIDeVth?fIQB_r5p z>ZzhIF2%wT`;yZ{4_1Vcshy$L4@n79}! z?GL`!n8Cr!%W&v`{`Vg&G7MGBYWfB|Cm6+B8RQRW?mkwjJ9q!W%bvIYFq$^+X;6@q z*&-`Y?|gva`UK8Dd`le{=Kql3{VmnR>`+?K@=8~_?c^Fmhnl2YpI6*bEW5sBTO{+8 ziw8Gx%(2_?cjNID4xF_s*achMS5G+gbz7}^)3Jk-B>#N!Pp^+%Sd?YJZmhX{|C*xj z(}e68Hr-2}QdMTR?jz&=2j^dw1O@v|%2C|#eA#6oru|oUNWPSFQ`YHrdbjiS)$5sV zH!f_vQ5CB!ZeDrXxbi{C(U%QV5={kSmflnKntL{Hql_5W?(nXoPpr=5rX@N|J+V+h zWACA$DLed=Gajg}Htt(De^pe^w3V4bV#^nKeiAsm&EqiNCe@7Ywu6gL2BoZD((i1s za?^xMc}p3zK3B}O;+*(zq3TI4lQ+gEf6th@;X%@ajholKPU%p26(My}XNA#`lgC$h zf7-T~r~AxBjT15LY##Z^>kg>?PNma+1VfXUPT}d|-3yU)jnVsb-=2_(X zs&Cd3uFXd#nF^f_VL$2NWU6<*w`bWC#f_&o^E93gnel#s&l|g|ny+kjbz5szJ?l{N zQkr&V(HiD?|I~z~c1*PXvMAW!q3f}$$-+#fi_-3#m(&g2Ol7jBuFp7?$+GHX_K%*O zfq(YPUcNjh;;2FA;@v8`o~9{1TbBP$nzcVeHPq#l$1Slo#kadkmMzWB**4dLqiEH% W4GhWO^Fp^u#ob#HdP|IPfdT+A20}Fe literal 0 HcmV?d00001 diff --git a/tests/data/nccm/CDL2019_clip.tif b/tests/data/nccm/CDL2019_clip.tif new file mode 100644 index 0000000000000000000000000000000000000000..9c4d1dcae44475397ef013ec226a6772ace5d515 GIT binary patch literal 967 zcmebD)MDUZU|-qGR53%@tUxv!lnv4&$jAcLTLt8ZB8iJZ z*$O~4;!ridObiS%NNQ}MY@jehFcO=unTLTHD0T~on_74n*nspWAYR|j40fXu(5?;b zJPaUxNkI0-b|$b-ihyhoQ0$R=QQISK52GQXxXH*P^86p}#W;NJIDeVth?fIQB_r5p z>()Zg4;Ue&c#lpO~|XInnrqo*Ta+!v(#`uX!HbcAS=>=<6zq+Iy9 zG$`lq##KpbKN$Egv9+)1zI|t!O-Gc~q-|cWjR50L$7%I@w-*2ePHv zn$$I_H{5jVy(Mx`sq@iNZc|R>*~XfyxmxyY%&|GMy0QQ8w)V3Vuee-M+<3>@Yk8K^ zf<}jxMxPq3S1vm5wm3h^U?yitlmtfJfE5X58VRHl5ukGq&2srZQ-ooClU7e@1s^lJcG!#vlb5d^!*GxT+ NV1{F=g+JsP8UR!XJAwcJ literal 0 HcmV?d00001 diff --git a/tests/data/nccm/data.py b/tests/data/nccm/data.py index 6a98ca3a2d0..2956f147033 100644 --- a/tests/data/nccm/data.py +++ b/tests/data/nccm/data.py @@ -5,7 +5,6 @@ import hashlib import os -import shutil import numpy as np import rasterio @@ -48,20 +47,14 @@ def create_file(path: str, dtype: str): if __name__ == "__main__": - dir = os.path.join(os.getcwd(), "13090442") - - if os.path.exists(dir) and os.path.isdir(dir): - shutil.rmtree(dir) - + dir = os.path.join(os.getcwd()) os.makedirs(dir, exist_ok=True) for file in files: create_file(os.path.join(dir, file), dtype="int8") - # Compress data - shutil.make_archive("13090442", "zip", ".", dir) - # Compute checksums - with open("13090442.zip", "rb") as f: - md5 = hashlib.md5(f.read()).hexdigest() - print(f"13090442.zip: {md5}") + for file in files: + with open(file, "rb") as f: + md5 = hashlib.md5(f.read()).hexdigest() + print(f"{file}: {md5}") diff --git a/tests/datasets/test_nccm.py b/tests/datasets/test_nccm.py index 6637da3e840..0d922d9d3d5 100644 --- a/tests/datasets/test_nccm.py +++ b/tests/datasets/test_nccm.py @@ -25,9 +25,19 @@ class TestNCCM: @pytest.fixture def dataset(self, monkeypatch: MonkeyPatch, tmp_path: Path) -> NCCM: monkeypatch.setattr(torchgeo.datasets.nccm, "download_url", download_url) - url = os.path.join("tests", "data", "nccm", "13090442.zip") + md5s = { + 2017: "ae5c390d0ffb8970d544b8a09142759f", + 2018: "0d453bdb8ea5b7318c33e62513760580", + 2019: "d4ab7ab00bb57623eafb6b27747e5639", + } + monkeypatch.setattr(NCCM, "md5s", md5s) + urls = { + 2017: os.path.join("tests", "data", "nccm", "CDL2017_clip.tif"), + 2018: os.path.join("tests", "data", "nccm", "CDL2018_clip1.tif"), + 2019: os.path.join("tests", "data", "nccm", "CDL2019_clip.tif"), + } + monkeypatch.setattr(NCCM, "urls", urls) transforms = nn.Identity() - monkeypatch.setattr(NCCM, "url", url) root = str(tmp_path) return NCCM(root, transforms=transforms, download=True, checksum=True) @@ -48,11 +58,8 @@ def test_or(self, dataset: NCCM) -> None: def test_already_extracted(self, dataset: NCCM) -> None: NCCM(dataset.paths, download=True) - def test_already_downloaded(self, tmp_path: Path) -> None: - pathname = os.path.join("tests", "data", "nccm", "13090442.zip") - root = str(tmp_path) - shutil.copy(pathname, root) - NCCM(root) + def test_already_downloaded(self, dataset: NCCM) -> None: + NCCM(dataset.paths, download=True) def test_plot(self, dataset: NCCM) -> None: query = dataset.bounds diff --git a/torchgeo/datasets/nccm.py b/torchgeo/datasets/nccm.py index 3a43ddddcc5..38a0d3eee91 100644 --- a/torchgeo/datasets/nccm.py +++ b/torchgeo/datasets/nccm.py @@ -3,8 +3,6 @@ """Northeastern China Crop Map Dataset.""" -import glob -import os from collections.abc import Iterable from typing import Any, Callable, Optional, Union @@ -14,7 +12,7 @@ from rasterio.crs import CRS from .geo import RasterDataset -from .utils import BoundingBox, DatasetNotFoundError, download_url, extract_archive +from .utils import BoundingBox, DatasetNotFoundError, download_url class NCCM(RasterDataset): @@ -55,12 +53,24 @@ class NCCM(RasterDataset): filename_regex = r"CDL(?P\d{4})_clip" filename_glob = "CDL*.*" - zipfile_glob = "13090442.zip" date_format = "%Y" is_image = False - url = "https://figshare.com/ndownloader/articles/13090442/versions/1" - md5 = "eae952f1b346d7e649d027e8139a76f5" + urls = { + 2019: "https://figshare.com/ndownloader/files/25070540", + 2018: "https://figshare.com/ndownloader/files/25070624", + 2017: "https://figshare.com/ndownloader/files/25070582", + } + md5s = { + 2019: "0d062bbd42e483fdc8239d22dba7020f", + 2018: "b3bb4894478d10786aa798fb11693ec1", + 2017: "d047fbe4a85341fa6248fd7e0badab6c", + } + fnames = { + 2019: "CDL2019_clip.tif", + 2018: "CDL2018_clip1.tif", + 2017: "CDL2017_clip.tif", + } cmap = { 0: (0, 255, 0, 255), @@ -75,6 +85,7 @@ def __init__( paths: Union[str, Iterable[str]] = "data", crs: Optional[CRS] = None, res: Optional[float] = None, + years: list[int] = [2019], transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None, cache: bool = True, download: bool = False, @@ -88,6 +99,7 @@ def __init__( (defaults to the CRS of the first file found) res: resolution of the dataset in units of CRS (defaults to the resolution of the first file found) + years: list of years for which to use nccm layers transforms: a function/transform that takes an input sample and returns a transformed version cache: if True, cache file handle to speed up repeated sampling @@ -97,7 +109,12 @@ def __init__( Raises: DatasetNotFoundError: If dataset is not found and *download* is False. """ + assert set(years) <= self.md5s.keys(), ( + "NCCM data product only exists for the following years: " + f"{list(self.md5s.keys())}." + ) self.paths = paths + self.years = years self.download = download self.checksum = checksum self.ordinal_map = torch.full((max(self.cmap.keys()) + 1,), 4, dtype=self.dtype) @@ -128,37 +145,26 @@ def __getitem__(self, query: BoundingBox) -> dict[str, Any]: def _verify(self) -> None: """Verify the integrity of the dataset.""" - # Check if the extracted files already exist + # Check if the files already exist if self.files: return - # Check if the zip file has already been downloaded - assert isinstance(self.paths, str) - pathname = os.path.join(self.paths, "**", self.zipfile_glob) - if glob.glob(pathname, recursive=True): - self._extract() - return - # Check if the user requested to download the dataset if not self.download: raise DatasetNotFoundError(self) # Download the dataset self._download() - self._extract() def _download(self) -> None: """Download the dataset.""" - filename = "13090442.zip" - download_url( - self.url, self.paths, filename, md5=self.md5 if self.checksum else None - ) - - def _extract(self) -> None: - """Extract the dataset.""" - assert isinstance(self.paths, str) - pathname = os.path.join(self.paths, "**", self.zipfile_glob) - extract_archive(glob.glob(pathname, recursive=True)[0], self.paths) + for year in self.years: + download_url( + self.urls[year], + self.paths, + filename=self.fnames[year], + md5=self.md5s[year] if self.checksum else None, + ) def plot( self,