代码复现记录BSAFusion

癌症影像档案馆下载脚本(TCIA)
1
import requests
2
import os
3
import time
4
import json
5
import zipfile
6
from urllib.parse import urlencode
7

8
class TCIADownloader:
    """Download the imaging series listed in a TCIA manifest file.

    Three strategies are implemented and tried in this order by
    ``test_download_methods``:

    1. the NBIA download servlet (one ZIP per series),
    2. the TCIA REST ``getImage`` endpoint (one ZIP per series),
    3. per-instance DICOM retrieval through the TCIA REST API.

    All HTTP traffic goes through a single ``requests.Session``.
    """

    # Manifest lines starting with this prefix are treated as series UIDs.
    SERIES_UID_PREFIX = '1.3.6.1.4.1.14519'
    # Base URL of the TCIA REST endpoints used by methods 2 and 3.
    TCIA_API_BASE = "https://services.cancerimagingarchive.net/services/v4/TCIA/query"
    # Size of the chunks read from a streamed response.
    CHUNK_SIZE = 8192
    # Downloads that are not larger than these sizes are discarded.
    MIN_ZIP_BYTES = 1000
    MIN_DICOM_BYTES = 100

    def __init__(self):
        """Create the HTTP session and set a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` if it exists; ignore filesystem errors."""
        try:
            os.remove(path)
        except OSError:
            pass

    @staticmethod
    def _looks_like_dicom(path):
        """Return True if ``path`` carries the ``DICM`` marker at byte offset 128."""
        try:
            with open(path, 'rb') as f:
                f.seek(128)
                return f.read(4) == b'DICM'
        except OSError:
            return False

    def _stream_to_file(self, response, filepath, show_progress=True):
        """Write a streamed response body to ``filepath``.

        The body is first written to ``filepath + '.part'`` and renamed only
        after the stream finished, so an interrupted transfer never leaves a
        truncated file under the final name.

        Args:
            response: a streamed ``requests`` response.
            filepath: final destination path.
            show_progress: print a percentage when the server sent a
                ``content-length`` header.

        Returns:
            Number of bytes written.

        Raises:
            Whatever the transfer or the filesystem raises; the partial file
            is removed before the exception propagates.
        """
        total_size = int(response.headers.get('content-length', 0))
        partial_path = filepath + '.part'
        downloaded = 0
        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if show_progress and total_size > 0:
                        progress = (downloaded / total_size) * 100
                        print(f"\r      下载进度: {progress:.1f}%", end='')
            os.replace(partial_path, filepath)
        except BaseException:
            # Also clean up on Ctrl-C, then let the caller see the error.
            self._remove_quietly(partial_path)
            raise
        return downloaded

    def _fetch_zip(self, url, param_combinations, filepath, timeout,
                   attempt_label, error_label, fatal_statuses=None):
        """Try each parameter set against ``url`` until one yields a ZIP.

        Args:
            url: endpoint to query.
            param_combinations: list of query-parameter dicts to try in order.
            filepath: where the ZIP is stored on success.
            timeout: request timeout in seconds.
            attempt_label: text printed in the per-attempt progress line.
            error_label: text printed in front of a caught exception.
            fatal_statuses: optional ``{status_code: message}``; when one of
                these statuses is returned the message is printed and no
                further parameter sets are tried.

        Returns:
            ``(True, filepath)`` on success, otherwise ``(False, None)``.
        """
        fatal_statuses = fatal_statuses or {}
        total = len(param_combinations)

        for attempt, params in enumerate(param_combinations, start=1):
            print(f"      尝试{attempt_label} {attempt}/{total}")
            try:
                # The context manager releases the connection of the
                # streamed response even when the attempt is abandoned.
                with self.session.get(url, params=params, stream=True,
                                      timeout=timeout) as response:
                    status = response.status_code
                    if status == 200:
                        downloaded = self._stream_to_file(response, filepath)
                    elif status in fatal_statuses:
                        print(f"      {fatal_statuses[status]}")
                        break
                    else:
                        print(f"      HTTP错误: {status}")
                        continue
            except Exception as e:
                print(f"      {error_label}: {e}")
                continue

            print(f"\r      下载完成: {downloaded} bytes")

            # A 200 response may still be an HTML error page; the file is
            # later opened with zipfile, so only a real ZIP counts.
            if downloaded > self.MIN_ZIP_BYTES and zipfile.is_zipfile(filepath):
                return True, filepath

            print("      响应不是有效的ZIP文件，已丢弃")
            self._remove_quietly(filepath)

        return False, None

    def _series_zip_exists(self, series_uid, download_dir, working_method):
        """Return True if a valid ZIP for this series/method is already on disk."""
        if working_method not in ('method1', 'method2'):
            return False
        filepath = os.path.join(download_dir, f"{series_uid}_{working_method}.zip")
        return os.path.isfile(filepath) and zipfile.is_zipfile(filepath)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def read_manifest(self, manifest_file):
        """Parse a ``.tcia`` manifest.

        Lines starting with ``SERIES_UID_PREFIX`` are collected as series
        UIDs; every other line containing ``=`` is split at the first ``=``
        into a config entry.  Remaining lines are ignored.

        Args:
            manifest_file: path of the manifest.

        Returns:
            ``(config, series_list)``: a dict of config entries and the
            list of series UIDs in file order.
        """
        config = {}
        series_list = []

        # utf-8-sig drops a leading BOM so it cannot end up in the first key;
        # an explicit encoding keeps parsing independent of the OS locale.
        with open(manifest_file, 'r', encoding='utf-8-sig', errors='replace') as f:
            for raw_line in f:
                line = raw_line.strip()
                if line.startswith(self.SERIES_UID_PREFIX):
                    series_list.append(line)
                elif '=' in line:
                    key, value = line.split('=', 1)
                    config[key] = value

        return config, series_list

    def try_method_1_nbia_servlet(self, series_uid, download_dir):
        """Method 1: fetch the series through the NBIA download servlet.

        Returns:
            ``(True, zip_path)`` on success, otherwise ``(False, None)``.
        """
        print("    方法1: NBIA Servlet")

        url = "https://nbia.cancerimagingarchive.net/nbia-download/servlet/DownloadServlet"
        param_combinations = [
            {'annotation': 'true', 'series': series_uid},
            {'includeAnnotation': 'true', 'series': series_uid},
            {'seriesInstanceUID': series_uid},
            {'SeriesInstanceUID': series_uid, 'format': 'zip'},
        ]
        filepath = os.path.join(download_dir, f"{series_uid}_method1.zip")

        return self._fetch_zip(url, param_combinations, filepath, timeout=120,
                               attempt_label="参数组合", error_label="错误")

    def try_method_2_tcia_api(self, series_uid, download_dir):
        """Method 2: fetch the series as a ZIP through the TCIA REST API.

        A 404 or 401 answer stops the attempt immediately because further
        parameter variations cannot change that outcome.

        Returns:
            ``(True, zip_path)`` on success, otherwise ``(False, None)``.
        """
        print("    方法2: TCIA REST API")

        url = "https://services.cancerimagingarchive.net/services/v4/TCIA/query/getImage"
        param_combinations = [
            {'SeriesInstanceUID': series_uid, 'format': 'zip'},
            {'SeriesInstanceUID': series_uid},
            {'series': series_uid, 'format': 'zip'},
        ]
        filepath = os.path.join(download_dir, f"{series_uid}_method2.zip")

        return self._fetch_zip(
            url, param_combinations, filepath, timeout=300,
            attempt_label="API参数组合", error_label="API错误",
            fatal_statuses={404: "系列未找到 (404)", 401: "需要身份验证 (401)"},
        )

    def try_method_3_direct_dicom(self, series_uid, download_dir, max_instances=3):
        """Method 3: download the instances of a series one by one.

        Args:
            series_uid: SeriesInstanceUID to download.
            download_dir: directory in which ``<series_uid>_dicom`` is created.
            max_instances: number of instances to fetch.  The default ``3``
                is a quick probe (used when testing methods); ``None``
                downloads every instance of the series.

        Returns:
            ``(True, series_dir)`` on success, otherwise ``(False, None)``.
            In probe mode one downloaded instance is enough; with
            ``max_instances=None`` every instance must be downloaded.
        """
        print("    方法3: 直接DICOM下载")

        try:
            # First obtain the list of instances in the series.
            url = "https://services.cancerimagingarchive.net/services/v4/TCIA/query/getSOPInstanceUIDs"
            params = {'SeriesInstanceUID': series_uid, 'format': 'json'}

            with self.session.get(url, params=params, timeout=60) as response:
                if response.status_code != 200:
                    print(f"      HTTP错误: {response.status_code}")
                    return False, None
                sop_instances = response.json()

            if not sop_instances:
                return False, None

            print(f"      找到 {len(sop_instances)} 个DICOM实例")

            if max_instances is None:
                selected = sop_instances
            else:
                selected = sop_instances[:max_instances]

            series_dir = os.path.join(download_dir, f"{series_uid}_dicom")
            os.makedirs(series_dir, exist_ok=True)

            downloaded_count = 0
            for instance in selected:
                sop_uid = instance.get('SOPInstanceUID', '')
                if sop_uid and self.download_single_dicom(series_uid, sop_uid, series_dir):
                    downloaded_count += 1

            if max_instances is None:
                success = 0 < downloaded_count == len(selected)
            else:
                success = downloaded_count > 0

            if success:
                print(f"      成功下载 {downloaded_count} 个DICOM文件")
                return True, series_dir
            if downloaded_count > 0:
                print(f"      仅下载了 {downloaded_count}/{len(selected)} 个DICOM文件")

        except Exception as e:
            print(f"      DICOM下载错误: {e}")

        return False, None

    def download_single_dicom(self, series_uid, sop_uid, series_dir):
        """Download one DICOM instance to ``<series_dir>/<sop_uid>.dcm``.

        An existing file that already carries the DICOM marker is kept and
        reported as success, so interrupted runs can be resumed.

        Returns:
            True if the instance is on disk afterwards, otherwise False.
        """
        filepath = os.path.join(series_dir, f"{sop_uid}.dcm")

        if os.path.isfile(filepath) and self._looks_like_dicom(filepath):
            return True

        try:
            # getSingleImage is the per-instance endpoint of the TCIA REST
            # API; getImage answers with a ZIP of the whole series, which
            # must not be stored under a .dcm name.
            url = f"{self.TCIA_API_BASE}/getSingleImage"
            params = {
                'SeriesInstanceUID': series_uid,
                'SOPInstanceUID': sop_uid
            }

            with self.session.get(url, params=params, stream=True,
                                  timeout=120) as response:
                if response.status_code != 200:
                    return False
                downloaded = self._stream_to_file(response, filepath,
                                                  show_progress=False)

            if downloaded > self.MIN_DICOM_BYTES:
                return True
            self._remove_quietly(filepath)

        except Exception as e:
            print(f"        单个DICOM下载错误: {e}")

        return False

    def test_download_methods(self, series_list, download_dir, test_count=3):
        """Probe the download methods on the first ``test_count`` series.

        For each probed series the methods are tried in order and the first
        one that succeeds is recorded.

        Returns:
            List of ``(method_name, path)`` tuples, one per series for which
            some method worked.
        """
        sample = series_list[:test_count]
        print(f"\n测试前 {len(sample)} 个系列的下载方法...")

        attempts = (
            ('method1', self.try_method_1_nbia_servlet),
            ('method2', self.try_method_2_tcia_api),
            ('method3', self.try_method_3_direct_dicom),
        )
        working_methods = []

        for i, series_uid in enumerate(sample, start=1):
            # Report against the real sample size, which may be smaller
            # than test_count for short manifests.
            print(f"\n测试系列 {i}/{len(sample)}: {series_uid}")

            for number, (name, method) in enumerate(attempts, start=1):
                success, filepath = method(series_uid, download_dir)
                if success:
                    working_methods.append((name, filepath))
                    print(f"  ✓ 方法{number} 成功")
                    break
            else:
                print("  ✗ 所有方法都失败")

        return working_methods

    def download_all_series(self, series_list, download_dir, working_method, delay=2):
        """Download every series with the given method.

        Series whose ZIP (methods 1 and 2) is already present and valid are
        skipped and counted as successful, so the program can be re-run after
        a partial failure.

        Args:
            series_list: series UIDs to download.
            download_dir: target directory.
            working_method: ``'method1'``, ``'method2'`` or ``'method3'``.
            delay: seconds to wait between two downloads to go easy on the
                server.

        Returns:
            ``(successful_downloads, failed_downloads)``.
        """
        print(f"\n使用方法 {working_method} 下载所有 {len(series_list)} 个系列...")

        methods = {
            'method1': self.try_method_1_nbia_servlet,
            'method2': self.try_method_2_tcia_api,
            # The real download needs every instance, not only the probe sample.
            'method3': lambda uid, target: self.try_method_3_direct_dicom(
                uid, target, max_instances=None),
        }
        download = methods.get(working_method)
        if download is None:
            print(f"  ✗ 未知的下载方法: {working_method}")
            return 0, len(series_list)

        successful_downloads = 0
        failed_downloads = 0
        total = len(series_list)

        for i, series_uid in enumerate(series_list, start=1):
            print(f"\n下载系列 {i}/{total}: {series_uid}")

            if self._series_zip_exists(series_uid, download_dir, working_method):
                successful_downloads += 1
                print("  ✓ 已存在，跳过")
                continue

            success, _ = download(series_uid, download_dir)

            if success:
                successful_downloads += 1
                print("  ✓ 下载成功")
            else:
                failed_downloads += 1
                print("  ✗ 下载失败")

            # Pause between requests; nothing to wait for after the last one.
            if delay > 0 and i < total:
                time.sleep(delay)

        print("\n=== 下载完成统计 ===")
        print(f"成功: {successful_downloads}")
        print(f"失败: {failed_downloads}")
        print(f"总计: {total}")

        return successful_downloads, failed_downloads

    def extract_zip_files(self, download_dir, extract_dir="tcia_extracted"):
        """Extract every ``*.zip`` in ``download_dir``.

        Each archive is unpacked into ``<extract_dir>/<archive name without
        .zip>``.  A failing archive is reported and does not stop the others.
        """
        os.makedirs(extract_dir, exist_ok=True)

        zip_files = sorted(f for f in os.listdir(download_dir) if f.endswith('.zip'))

        if not zip_files:
            print("没有找到ZIP文件")
            return

        print(f"\n解压 {len(zip_files)} 个ZIP文件...")

        for zip_file in zip_files:
            zip_path = os.path.join(download_dir, zip_file)
            extract_path = os.path.join(extract_dir, zip_file[:-4])

            try:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
                print(f"✓ 解压完成: {zip_file}")
            except Exception as e:
                print(f"✗ 解压失败 {zip_file}: {e}")
306

307
def main(manifest_file="Vestibular-Schwannooma-MC-RC manifest August 2023.tcia",
         download_dir="tcia_downloads",
         extract_dir="tcia_extracted"):
    """Run the interactive download workflow.

    Reads the manifest, probes the download methods on the first three
    series, and, after confirmation, downloads every series and optionally
    extracts the ZIP files.

    Args:
        manifest_file: path of the TCIA manifest to read.
        download_dir: directory that receives the downloads; created only
            once the manifest has been found.
        extract_dir: directory that receives extracted archives.

    Returns:
        None.  Progress and errors are reported on stdout.
    """
    print("TCIA数据下载器 v3.0")
    print("=" * 60)

    # Check the manifest before touching the filesystem.
    if not os.path.exists(manifest_file):
        print(f"错误: 找不到manifest文件: {manifest_file}")
        print("请确保manifest文件在当前目录下")
        return

    os.makedirs(download_dir, exist_ok=True)

    try:
        downloader = TCIADownloader()

        config, series_list = downloader.read_manifest(manifest_file)
        print(f"从manifest文件读取到 {len(series_list)} 个系列")

        if not series_list:
            print("错误: manifest文件中没有找到系列UID")
            return

        # Probe which download method works.
        working_methods = downloader.test_download_methods(series_list, download_dir, test_count=3)

        if not working_methods:
            print("\n❌ 所有下载方法都失败了")
            print("可能的原因:")
            print("1. 网络连接问题")
            print("2. TCIA服务器暂时不可用")
            print("3. 需要登录TCIA账户")
            print("4. 数据集可能已被移除或限制访问")
            return

        # working_methods holds one entry per probed series, so count the
        # distinct method names rather than the entries.
        distinct_methods = {method for method, _ in working_methods}
        print(f"\n✅ 找到 {len(distinct_methods)} 种有效的下载方法")

        # Use the method that worked for the first probed series.
        best_method = working_methods[0][0]

        choice = input(f"\n是否使用找到的方法下载所有 {len(series_list)} 个系列? (y/n): ").lower().strip()

        if choice != 'y':
            print("下载已取消")
            return

        successful, failed = downloader.download_all_series(series_list, download_dir, best_method)

        if successful <= 0:
            print("\n❌ 没有成功下载任何文件")
            return

        extract_choice = input(f"\n成功下载了 {successful} 个文件。是否解压ZIP文件? (y/n): ").lower().strip()
        if extract_choice == 'y':
            downloader.extract_zip_files(download_dir, extract_dir)
            print(f"\n文件已解压到: {extract_dir}")

        print("\n✅ 下载完成!")
        print(f"📁 下载目录: {download_dir}")
        if extract_choice == 'y':
            print(f"📁 解压目录: {extract_dir}")

    except KeyboardInterrupt:
        print("\n\n⚠️  下载被用户中断")
    except Exception as e:
        # Top-level boundary: report anything unexpected with a traceback.
        print(f"\n❌ 发生未知错误: {e}")
        import traceback
        traceback.print_exc()
383

384
def show_help():
    """Print the usage instructions to stdout."""
    # The manifest name below must match the default that main() opens;
    # the previous text named a different file, so following the help
    # led straight to the "manifest not found" error.
    help_text = """
TCIA数据下载器使用说明:

1. 准备工作:
   - 将TCIA manifest文件保存为 'Vestibular-Schwannooma-MC-RC manifest August 2023.tcia'
   - 确保网络连接正常
   - 安装Python依赖: pip install requests

2. 运行方式:
   python tcia_downloader.py

3. 下载过程:
   - 程序会自动测试多种下载方法
   - 找到有效方法后会询问是否继续下载全部文件
   - 下载完成后可选择是否解压文件

4. 输出目录:
   - tcia_downloads/     : 下载的原始文件
   - tcia_extracted/     : 解压后的DICOM文件

5. 注意事项:
   - 医学影像文件通常很大，确保有足够磁盘空间
   - 下载可能需要很长时间，请保持网络连接稳定
   - 如果某些文件下载失败，可以重新运行程序（会跳过已下载的文件）

6. 故障排除:
   - 如果所有方法都失败，可能需要先登录TCIA网站
   - 检查防火墙设置是否阻止了连接
   - 尝试使用VPN或更换网络环境
"""
    print(help_text)
417

418
def check_dependencies():
    """Report whether the ``requests`` package can be imported.

    Returns:
        True when the import succeeds; otherwise an installation hint is
        printed and False is returned.
    """
    try:
        import requests
    except ImportError:
        print("❌ 缺少依赖项: requests")
        print("请运行: pip install requests")
        available = False
    else:
        available = True
    return available
427

428
def check_disk_space(download_dir, estimated_size_gb=50):
    """Check that the volume holding ``download_dir`` has enough free space.

    Args:
        download_dir: directory the downloads will go to; it does not need
            to exist yet.
        estimated_size_gb: space expected to be needed, in GiB.

    Returns:
        True when there is enough space, when the user confirms to continue
        anyway, or when the check cannot be performed; False when the user
        declines.
    """
    import shutil

    # disk_usage() needs an existing path, so probe the nearest existing
    # ancestor when the download directory has not been created yet.
    probe = os.path.abspath(download_dir)
    while not os.path.exists(probe):
        parent = os.path.dirname(probe)
        if parent == probe:
            break
        probe = parent

    try:
        free_space = shutil.disk_usage(probe).free / (1024**3)  # GB
    except OSError as e:
        # Best effort: an unreadable volume must not block the download.
        print(f"⚠️  无法检查磁盘空间: {e}")
        return True

    if free_space >= estimated_size_gb:
        print(f"✅ 磁盘空间充足: {free_space:.1f} GB")
        return True

    print("⚠️  磁盘空间可能不足")
    print(f"可用空间: {free_space:.1f} GB")
    print(f"预估需要: {estimated_size_gb} GB")
    try:
        choice = input("是否继续? (y/n): ").lower().strip()
    except (EOFError, OSError):
        # No interactive stdin to ask on: keep going, as before.  Ctrl-C is
        # deliberately not caught here so the user can still abort.
        return True
    return choice == 'y'
445

446
def create_download_summary(download_dir, series_list, successful_count, failed_count):
    """Write ``download_summary.txt`` into ``download_dir``.

    The summary contains the time, the counters, the success rate and a
    size listing of every ``.zip`` / ``.dcm`` file found below
    ``download_dir`` (subdirectories included).

    Args:
        download_dir: directory that holds the downloads.
        series_list: all series that were requested.
        successful_count: number of series downloaded successfully.
        failed_count: number of series that failed.

    Returns:
        None.  Failures are reported on stdout instead of being raised.
    """
    summary_file = os.path.join(download_dir, "download_summary.txt")

    try:
        total = len(series_list)
        # An empty series list must not abort the summary half-written.
        success_rate = (successful_count / total * 100) if total else 0.0

        # DICOM files are stored in per-series subdirectories, so walk the
        # tree instead of listing only the top level.
        downloaded_files = sorted(
            os.path.relpath(os.path.join(root, name), download_dir)
            for root, _dirs, names in os.walk(download_dir)
            for name in names
            if name.endswith(('.zip', '.dcm'))
        )

        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write("TCIA数据下载摘要\n")
            f.write("=" * 50 + "\n\n")
            f.write(f"下载时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"总系列数: {total}\n")
            f.write(f"成功下载: {successful_count}\n")
            f.write(f"失败数量: {failed_count}\n")
            f.write(f"成功率: {success_rate:.1f}%\n\n")

            if downloaded_files:
                f.write("已下载文件列表:\n")
                f.write("-" * 30 + "\n")
                for file in downloaded_files:
                    file_path = os.path.join(download_dir, file)
                    file_size = os.path.getsize(file_path) / (1024*1024)  # MB
                    f.write(f"{file:<50} {file_size:>8.1f} MB\n")

        print(f"📋 下载摘要已保存到: {summary_file}")

    except Exception as e:
        print(f"⚠️  无法创建下载摘要: {e}")
476

477
if __name__ == "__main__":
    import sys

    # Show the usage text and stop when the first argument asks for help.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] in ('--help', '-h'):
        show_help()
        sys.exit(0)

    # Abort early when a required package is missing.
    dependencies_ok = check_dependencies()
    if not dependencies_ok:
        sys.exit(1)

    # Start the interactive workflow.
    main()
训练截图#

融合结果#

CT-MRI
PET-MRI
SPECT-MRI
代码复现记录BSAFusion

训练截图#

融合结果#

评论区

目录