Leaked source code of Windows Server 2003

/*++

Copyright (c) 1999 Microsoft Corporation

Module Name:

    buildmdl.c

Abstract:

    This module contains the Mm support routines for the cache manager to
    prefetch groups of pages from secondary storage using logical file
    offsets instead of virtual addresses. This saves the cache manager from
    having to map pages unnecessarily.

    The caller builds a list of various file objects and logical block offsets,
    passing them to MmPrefetchPagesIntoLockedMdl. The code here then examines
    the internal pages, reading in those that are not already valid or in
    transition. These pages are read with a single read, using a dummy page
    to bridge gaps of pages that were valid or transition prior to the I/O
    being issued.

    Upon conclusion of the I/O, control is returned to the calling thread.

    All pages are reference counted as though they were probed and locked,
    regardless of whether they are currently valid or transition.

Author:

    Landy Wang (landyw) 12-Feb-2001

Revision History:

--*/
#include "mi.h"

#if DBG
ULONG MiCcDebug;

#define MI_CC_FORCE_PREFETCH    0x1     // Trim all user pages to force prefetch
#define MI_CC_DELAY             0x2     // Delay hoping to trigger collisions
#endif

typedef struct _MI_READ_INFO {
    PCONTROL_AREA ControlArea;
    PFILE_OBJECT FileObject;
    LARGE_INTEGER FileOffset;
    PMMINPAGE_SUPPORT InPageSupport;
    PMDL IoMdl;
    PMDL ApiMdl;
    PMMPFN DummyPagePfn;
    PSUBSECTION FirstReferencedSubsection;
    PSUBSECTION LastReferencedSubsection;
    SIZE_T LengthInBytes;
} MI_READ_INFO, *PMI_READ_INFO;

VOID
MiCcReleasePrefetchResources (
    IN PMI_READ_INFO MiReadInfo,
    IN NTSTATUS Status
    );

NTSTATUS
MiCcPrepareReadInfo (
    IN PMI_READ_INFO MiReadInfo
    );

NTSTATUS
MiCcPutPagesInTransition (
    IN PMI_READ_INFO MiReadInfo
    );

NTSTATUS
MiCcCompletePrefetchIos (
    PMI_READ_INFO MiReadInfo
    );

VOID
MiRemoveUserPages (
    VOID
    );

VOID
MiPfFreeDummyPage (
    IN PMMPFN DummyPagePfn
    );

#ifdef ALLOC_PRAGMA
#pragma alloc_text (PAGE, MmPrefetchPagesIntoLockedMdl)
#pragma alloc_text (PAGE, MiCcPrepareReadInfo)
#pragma alloc_text (PAGE, MiCcReleasePrefetchResources)
#endif
NTSTATUS
MmPrefetchPagesIntoLockedMdl (
    IN PFILE_OBJECT FileObject,
    IN PLARGE_INTEGER FileOffset,
    IN SIZE_T Length,
    OUT PMDL *MdlOut
    )

/*++

Routine Description:

    This routine fills an MDL with pages described by the file object's
    offset and length.

    This routine is for cache manager usage only.

Arguments:

    FileObject - Supplies a pointer to the file object for a file which was
                 opened with NO_INTERMEDIATE_BUFFERING clear, i.e., for
                 which CcInitializeCacheMap was called by the file system.

    FileOffset - Supplies the byte offset in the file for the desired data.

    Length - Supplies the length of the desired data in bytes.

    MdlOut - On output it returns a pointer to an Mdl describing
             the desired data.

Return Value:

    NTSTATUS.

Environment:

    Kernel mode. PASSIVE_LEVEL.

--*/

{
    MI_READ_INFO MiReadInfo;
    NTSTATUS status;
    LOGICAL ApcNeeded;
    PETHREAD CurrentThread;

    ASSERT (KeGetCurrentIrql() == PASSIVE_LEVEL);

    RtlZeroMemory (&MiReadInfo, sizeof(MiReadInfo));

    MiReadInfo.FileObject = FileObject;
    MiReadInfo.FileOffset = *FileOffset;
    MiReadInfo.LengthInBytes = Length;

    //
    // Prepare for the impending read : allocate MDLs, inpage blocks,
    // reference count subsections, etc.
    //

    status = MiCcPrepareReadInfo (&MiReadInfo);

    if (!NT_SUCCESS (status)) {
        MiCcReleasePrefetchResources (&MiReadInfo, status);
        return status;
    }

    ASSERT (MiReadInfo.InPageSupport != NULL);

    //
    // APCs must be disabled once we put a page in transition. Otherwise
    // a thread suspend will stop us from issuing the I/O - this will hang
    // any other threads that need the same page.
    //

    CurrentThread = PsGetCurrentThread();

    ApcNeeded = FALSE;

    KeEnterCriticalRegionThread (&CurrentThread->Tcb);

    //
    // The nested fault count protects this thread from deadlocks where a
    // special kernel APC fires and references the same user page(s) we are
    // putting in transition.
    //

    KeEnterGuardedRegionThread (&CurrentThread->Tcb);
    ASSERT (CurrentThread->NestedFaultCount == 0);
    CurrentThread->NestedFaultCount += 1;
    KeLeaveGuardedRegionThread (&CurrentThread->Tcb);

    //
    // Allocate physical memory, lock down all the pages and issue any
    // I/O that may be needed. When MiCcPutPagesInTransition returns
    // STATUS_SUCCESS or STATUS_ISSUE_PAGING_IO, it guarantees that the
    // ApiMdl contains reference-counted (locked-down) pages.
    //

    status = MiCcPutPagesInTransition (&MiReadInfo);

    if (NT_SUCCESS (status)) {

        //
        // No I/O was issued because all the pages were already resident and
        // have now been locked down.
        //

        ASSERT (MiReadInfo.ApiMdl != NULL);
    }
    else if (status == STATUS_ISSUE_PAGING_IO) {

        //
        // Wait for the I/O to complete. Note APCs must remain disabled.
        //

        ASSERT (MiReadInfo.InPageSupport != NULL);

        status = MiCcCompletePrefetchIos (&MiReadInfo);
    }
    else {

        //
        // Some error occurred (like insufficient memory, etc) so fail
        // the request by falling through.
        //
    }

    //
    // Release acquired resources like pool, subsections, etc.
    //

    MiCcReleasePrefetchResources (&MiReadInfo, status);

    //
    // Only now that the I/O has been completed (not just issued) can
    // APCs be re-enabled. This prevents a user-issued suspend APC from
    // keeping a shared page in transition forever.
    //

    KeEnterGuardedRegionThread (&CurrentThread->Tcb);

    ASSERT (CurrentThread->NestedFaultCount == 1);

    CurrentThread->NestedFaultCount -= 1;

    if (CurrentThread->ApcNeeded == 1) {
        ApcNeeded = TRUE;
        CurrentThread->ApcNeeded = 0;
    }

    KeLeaveGuardedRegionThread (&CurrentThread->Tcb);

    KeLeaveCriticalRegionThread (&CurrentThread->Tcb);

    ASSERT (KeGetCurrentIrql() == PASSIVE_LEVEL);
    ASSERT (CurrentThread->NestedFaultCount == 0);
    ASSERT (CurrentThread->ApcNeeded == 0);

    if (ApcNeeded == TRUE) {
        IoRetryIrpCompletions ();
    }

    *MdlOut = MiReadInfo.ApiMdl;

    return status;
}
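//
// A minimal caller-side sketch of the contract above, for illustration only
// (it is not part of this module). It assumes a cache-manager-style caller
// that already holds a referenced FILE_OBJECT, and it assumes the returned
// locked-page MDL is released with MmUnlockPages and ExFreePool, mirroring
// how this file frees its own MDLs. The helper name CcpPrefetchExample is
// hypothetical.
//

#if 0

NTSTATUS
CcpPrefetchExample (
    IN PFILE_OBJECT FileObject
    )
{
    NTSTATUS status;
    PMDL Mdl;
    LARGE_INTEGER Offset;

    //
    // Prefetch the first 64KB of the file. The offset must be page aligned
    // (MiCcPrepareReadInfo asserts BYTE_OFFSET(ReadOffset) == 0).
    //

    Offset.QuadPart = 0;

    status = MmPrefetchPagesIntoLockedMdl (FileObject,
                                           &Offset,
                                           64 * 1024,
                                           &Mdl);

    if (!NT_SUCCESS (status)) {
        return status;
    }

    //
    // The MDL now describes reference-counted (locked) pages and can be
    // used for a transfer without mapping the data.
    //

    MmUnlockPages (Mdl);
    ExFreePool (Mdl);

    return STATUS_SUCCESS;
}

#endif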
VOID
MiCcReleasePrefetchResources (
    IN PMI_READ_INFO MiReadInfo,
    IN NTSTATUS Status
    )

/*++

Routine Description:

    This routine releases all resources consumed to handle a system cache
    logical offset based prefetch.

Environment:

    Kernel mode, PASSIVE_LEVEL.

--*/

{
    PSUBSECTION FirstReferencedSubsection;
    PSUBSECTION LastReferencedSubsection;

    //
    // Release all subsection prototype PTE references.
    //

    FirstReferencedSubsection = MiReadInfo->FirstReferencedSubsection;
    LastReferencedSubsection = MiReadInfo->LastReferencedSubsection;

    while (FirstReferencedSubsection != LastReferencedSubsection) {
        MiRemoveViewsFromSectionWithPfn ((PMSUBSECTION) FirstReferencedSubsection,
                                         FirstReferencedSubsection->PtesInSubsection);
        FirstReferencedSubsection = FirstReferencedSubsection->NextSubsection;
    }

    if (MiReadInfo->IoMdl != NULL) {
        ExFreePool (MiReadInfo->IoMdl);
    }

    //
    // Note successful returns yield the ApiMdl so don't free it here.
    //

    if (!NT_SUCCESS (Status)) {
        if (MiReadInfo->ApiMdl != NULL) {
            ExFreePool (MiReadInfo->ApiMdl);
        }
    }

    if (MiReadInfo->InPageSupport != NULL) {
#if DBG
        MiReadInfo->InPageSupport->ListEntry.Next = NULL;
#endif
        MiFreeInPageSupportBlock (MiReadInfo->InPageSupport);
    }

    //
    // Put DummyPage back on the free list.
    //

    if (MiReadInfo->DummyPagePfn != NULL) {
        MiPfFreeDummyPage (MiReadInfo->DummyPagePfn);
    }
}
NTSTATUS
MiCcPrepareReadInfo (
    IN PMI_READ_INFO MiReadInfo
    )

/*++

Routine Description:

    This routine constructs MDLs that describe the pages in the argument
    read-list. The caller will then issue the I/O on return.

Arguments:

    MiReadInfo - Supplies a pointer to the read-list.

Return Value:

    Various NTSTATUS codes.

Environment:

    Kernel mode, PASSIVE_LEVEL.

--*/

{
    UINT64 PteOffset;
    NTSTATUS Status;
    PMMPTE ProtoPte;
    PMMPTE LastProto;
    PMMPTE *ProtoPteArray;
    PCONTROL_AREA ControlArea;
    PSUBSECTION Subsection;
    PMMINPAGE_SUPPORT InPageSupport;
    PMDL Mdl;
    PMDL IoMdl;
    PMDL ApiMdl;
    ULONG i;
    PFN_NUMBER NumberOfPages;

    ASSERT (KeGetCurrentIrql() == PASSIVE_LEVEL);

    NumberOfPages = ADDRESS_AND_SIZE_TO_SPAN_PAGES (MiReadInfo->FileOffset.LowPart, MiReadInfo->LengthInBytes);

    //
    // Translate the section object into the relevant control area.
    //

    ControlArea = (PCONTROL_AREA)MiReadInfo->FileObject->SectionObjectPointer->DataSectionObject;

    //
    // If the section is backed by a ROM, then there's no need to prefetch
    // anything as it would waste RAM.
    //

    if (ControlArea->u.Flags.Rom == 1) {
        ASSERT (XIPConfigured == TRUE);
        return STATUS_NOT_SUPPORTED;
    }

    //
    // Initialize the internal Mi readlist.
    //

    MiReadInfo->ControlArea = ControlArea;

    //
    // Allocate and initialize an inpage support block for this run.
    //

    InPageSupport = MiGetInPageSupportBlock (MM_NOIRQL, &Status);

    if (InPageSupport == NULL) {
        ASSERT (!NT_SUCCESS (Status));
        return Status;
    }

    MiReadInfo->InPageSupport = InPageSupport;

    //
    // Allocate and initialize an MDL to return to our caller. The actual
    // frame numbers are filled in when all the pages are reference counted.
    //

    ApiMdl = MmCreateMdl (NULL, NULL, NumberOfPages << PAGE_SHIFT);

    if (ApiMdl == NULL) {
        return STATUS_INSUFFICIENT_RESOURCES;
    }

    ApiMdl->MdlFlags |= MDL_PAGES_LOCKED;

    MiReadInfo->ApiMdl = ApiMdl;

    //
    // Allocate and initialize an MDL to use for the actual transfer (if any).
    //

    IoMdl = MmCreateMdl (NULL, NULL, NumberOfPages << PAGE_SHIFT);

    if (IoMdl == NULL) {
        return STATUS_INSUFFICIENT_RESOURCES;
    }

    MiReadInfo->IoMdl = IoMdl;
    Mdl = IoMdl;

    //
    // Make sure the section is really prefetchable - physical, image and
    // pagefile-backed sections are not.
    //

    if ((ControlArea->u.Flags.PhysicalMemory) ||
        (ControlArea->u.Flags.Image == 1) ||
        (ControlArea->FilePointer == NULL)) {

        return STATUS_INVALID_PARAMETER_1;
    }

    //
    // Start the read at the proper file offset.
    //

    InPageSupport->ReadOffset = MiReadInfo->FileOffset;
    ASSERT (BYTE_OFFSET (InPageSupport->ReadOffset.LowPart) == 0);
    InPageSupport->FilePointer = MiReadInfo->FileObject;

    //
    // Stash a pointer to the start of the prototype PTE array (the values
    // in the array are not contiguous as they may cross subsections)
    // in the inpage block so we can walk it quickly later when the pages
    // are put into transition.
    //

    ProtoPteArray = (PMMPTE *)(Mdl + 1);

    InPageSupport->BasePte = (PMMPTE) ProtoPteArray;

    //
    // Data (but not image) reads use the whole page and the filesystems
    // zero fill any remainder beyond valid data length so we don't
    // bother to handle this here. It is important to specify the
    // entire page where possible so the filesystem won't post this,
    // which would hurt perf. LWFIX: must use CcZero to make this true.
    //

    ASSERT (((ULONG_PTR)Mdl & (sizeof(QUAD) - 1)) == 0);
    InPageSupport->u1.e1.PrefetchMdlHighBits = ((ULONG_PTR)Mdl >> 3);

    //
    // Initialize the prototype PTE pointers.
    //

    ASSERT (ControlArea->u.Flags.GlobalOnlyPerSession == 0);

    if (ControlArea->u.Flags.Rom == 0) {
        Subsection = (PSUBSECTION)(ControlArea + 1);
    }
    else {
        Subsection = (PSUBSECTION)((PLARGE_CONTROL_AREA)ControlArea + 1);
    }

#if DBG
    if (MiCcDebug & MI_CC_FORCE_PREFETCH) {
        MiRemoveUserPages ();
    }
#endif

    //
    // Calculate the first prototype PTE address.
    //

    PteOffset = (UINT64)(MiReadInfo->FileOffset.QuadPart >> PAGE_SHIFT);

    //
    // Make sure the PTEs are not in the extended part of the segment.
    //

    while (TRUE) {

        //
        // A memory barrier is needed to read the subsection chains
        // in order to ensure the writes to the actual individual
        // subsection data structure fields are visible in correct
        // order. This avoids the need to acquire any stronger
        // synchronization (ie: PFN lock), thus yielding better
        // performance and pagability.
        //

        KeMemoryBarrier ();

        if (PteOffset < (UINT64) Subsection->PtesInSubsection) {
            break;
        }

        PteOffset -= Subsection->PtesInSubsection;

        Subsection = Subsection->NextSubsection;
    }

    Status = MiAddViewsForSectionWithPfn ((PMSUBSECTION) Subsection,
                                          Subsection->PtesInSubsection);

    if (!NT_SUCCESS (Status)) {
        return Status;
    }

    MiReadInfo->FirstReferencedSubsection = Subsection;
    MiReadInfo->LastReferencedSubsection = Subsection;

    ProtoPte = &Subsection->SubsectionBase[PteOffset];
    LastProto = &Subsection->SubsectionBase[Subsection->PtesInSubsection];

    for (i = 0; i < NumberOfPages; i += 1) {

        //
        // Calculate which PTE maps the given logical block offset.
        //
        // Always look forwards (as an optimization) in the subsection chain.
        //
        // A quick check is made first to avoid recalculations and loops where
        // possible.
        //

        if (ProtoPte >= LastProto) {

            //
            // Handle extended subsections. Increment the view count for
            // every subsection spanned by this request, creating prototype
            // PTEs if needed.
            //

            ASSERT (i != 0);

            Subsection = Subsection->NextSubsection;

            Status = MiAddViewsForSectionWithPfn ((PMSUBSECTION) Subsection,
                                                  Subsection->PtesInSubsection);

            if (!NT_SUCCESS (Status)) {
                return Status;
            }

            MiReadInfo->LastReferencedSubsection = Subsection;

            ProtoPte = Subsection->SubsectionBase;

            LastProto = &Subsection->SubsectionBase[Subsection->PtesInSubsection];
        }

        *ProtoPteArray = ProtoPte;
        ProtoPteArray += 1;

        ProtoPte += 1;
    }

    return STATUS_SUCCESS;
}
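//
// Illustration of the layout MiCcPrepareReadInfo relies on, not code from
// this module: an MDL is a header immediately followed by its page array,
// so (Mdl + 1) addresses that array. Here the array slots first hold
// prototype PTE pointers and are later overwritten with page frame numbers
// by MiCcPutPagesInTransition (both are pointer sized in NT, so the same
// space serves for either). The toy names below (TOY_MDL, TOY_PAGE_SIZE)
// are stand-ins so the sketch builds as a standalone user-mode program.
//

#if 0

#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE 4096

typedef unsigned long long TOY_PFN;

typedef struct _TOY_MDL {
    unsigned short Size;            // header size plus the trailing array
    unsigned long ByteCount;        // length of the described transfer
} TOY_MDL;

int
main (
    void
    )
{
    unsigned int Pages = 4;
    unsigned int i;
    TOY_MDL *Mdl;
    void **ProtoPteSlots;
    TOY_PFN *PfnArray;

    //
    // Allocate the header plus one slot per page, the shape MmCreateMdl
    // produces.
    //

    Mdl = malloc (sizeof (TOY_MDL) + Pages * sizeof (TOY_PFN));
    if (Mdl == NULL) {
        return 1;
    }

    Mdl->Size = (unsigned short)(sizeof (TOY_MDL) + Pages * sizeof (TOY_PFN));
    Mdl->ByteCount = Pages * TOY_PAGE_SIZE;

    //
    // Phase 1 (MiCcPrepareReadInfo): stash per-page pointers right after
    // the header.
    //

    ProtoPteSlots = (void **)(Mdl + 1);
    for (i = 0; i < Pages; i += 1) {
        ProtoPteSlots[i] = NULL;
    }

    //
    // Phase 2 (MiCcPutPagesInTransition): reuse the very same space as the
    // page frame number array the I/O path expects.
    //

    PfnArray = (TOY_PFN *)(Mdl + 1);
    for (i = 0; i < Pages; i += 1) {
        PfnArray[i] = 0x1000 + i;
    }

    for (i = 0; i < Pages; i += 1) {
        printf ("slot %u -> pfn %#llx\n", i, PfnArray[i]);
    }

    free (Mdl);
    return 0;
}

#endif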
NTSTATUS
MiCcPutPagesInTransition (
    IN PMI_READ_INFO MiReadInfo
    )

/*++

Routine Description:

    This routine allocates physical memory for the specified read-list and
    puts all the pages in transition (so collided faults from other threads
    for these same pages remain coherent). I/O for any pages not already
    resident are issued here. The caller must wait for their completion.

Arguments:

    MiReadInfo - Supplies a pointer to the read-list.

Return Value:

    STATUS_SUCCESS - all the pages were already resident, reference counts
                     have been applied and no I/O needs to be waited for.

    STATUS_ISSUE_PAGING_IO - the I/O has been issued and the caller must wait.

    Various other failure status values indicate the operation failed.

Environment:

    Kernel mode. PASSIVE_LEVEL.

--*/

{
    NTSTATUS status;
    PMMPTE LocalPrototypePte;
    PVOID StartingVa;
    PFN_NUMBER MdlPages;
    KIRQL OldIrql;
    MMPTE PteContents;
    PFN_NUMBER PageFrameIndex;
    PFN_NUMBER ResidentAvailableCharge;
    PPFN_NUMBER IoPage;
    PPFN_NUMBER ApiPage;
    PPFN_NUMBER Page;
    PPFN_NUMBER DestinationPage;
    ULONG PageColor;
    PMMPTE PointerPte;
    PMMPTE *ProtoPteArray;
    PMMPTE *EndProtoPteArray;
    PFN_NUMBER DummyPage;
    PMDL Mdl;
    PMDL FreeMdl;
    PMMPFN PfnProto;
    PMMPFN Pfn1;
    PMMPFN DummyPfn1;
    ULONG i;
    PFN_NUMBER DummyTrim;
    ULONG NumberOfPagesNeedingIo;
    MMPTE TempPte;
    PMMPTE PointerPde;
    PEPROCESS CurrentProcess;
    PMMINPAGE_SUPPORT InPageSupport;

    ASSERT (KeGetCurrentIrql() == PASSIVE_LEVEL);

    MiReadInfo->DummyPagePfn = NULL;

    FreeMdl = NULL;

    CurrentProcess = PsGetCurrentProcess();

    PfnProto = NULL;

    PointerPde = NULL;

    InPageSupport = MiReadInfo->InPageSupport;

    Mdl = MI_EXTRACT_PREFETCH_MDL (InPageSupport);
    ASSERT (Mdl == MiReadInfo->IoMdl);

    IoPage = (PPFN_NUMBER)(Mdl + 1);
    ApiPage = (PPFN_NUMBER)(MiReadInfo->ApiMdl + 1);

    StartingVa = (PVOID)((PCHAR)Mdl->StartVa + Mdl->ByteOffset);

    MdlPages = ADDRESS_AND_SIZE_TO_SPAN_PAGES (StartingVa,
                                               Mdl->ByteCount);

    if (MdlPages + 1 > MAXUSHORT) {

        //
        // The PFN ReferenceCount for the dummy page could wrap, refuse the
        // request.
        //

        return STATUS_INSUFFICIENT_RESOURCES;
    }
    NumberOfPagesNeedingIo = 0;

    ProtoPteArray = (PMMPTE *)InPageSupport->BasePte;
    EndProtoPteArray = ProtoPteArray + MdlPages;

    ASSERT (*ProtoPteArray != NULL);

    LOCK_PFN (OldIrql);

    //
    // Ensure sufficient pages exist for the transfer plus the dummy page.
    //

    if (((SPFN_NUMBER)MdlPages > (SPFN_NUMBER)(MmAvailablePages - MM_HIGH_LIMIT)) ||
        (MI_NONPAGABLE_MEMORY_AVAILABLE() <= (SPFN_NUMBER)MdlPages)) {

        UNLOCK_PFN (OldIrql);

        return STATUS_INSUFFICIENT_RESOURCES;
    }

    //
    // Charge resident available immediately as the PFN lock may get released
    // and reacquired below before all the pages have been locked down.
    // Note the dummy page is immediately charged separately.
    //

    MI_DECREMENT_RESIDENT_AVAILABLE (MdlPages, MM_RESAVAIL_ALLOCATE_BUILDMDL);

    ResidentAvailableCharge = MdlPages;

    //
    // Allocate a dummy page to map discarded pages that aren't skipped.
    //

    DummyPage = MiRemoveAnyPage (0);
    Pfn1 = MI_PFN_ELEMENT (DummyPage);

    ASSERT (Pfn1->u2.ShareCount == 0);
    ASSERT (Pfn1->u3.e2.ReferenceCount == 0);

    MiInitializePfnForOtherProcess (DummyPage, MI_PF_DUMMY_PAGE_PTE, 0);

    //
    // Always bias the reference count by 1 and charge for this locked page
    // up front so the myriad increments and decrements don't get slowed
    // down with needless checking.
    //

    Pfn1->u3.e1.PrototypePte = 0;

    MI_ADD_LOCKED_PAGE_CHARGE(Pfn1, TRUE, 42);
    Pfn1->u3.e2.ReferenceCount += 1;

    Pfn1->u3.e1.ReadInProgress = 1;

    MiReadInfo->DummyPagePfn = Pfn1;

    DummyPfn1 = Pfn1;

    DummyPfn1->u3.e2.ReferenceCount =
        (USHORT)(DummyPfn1->u3.e2.ReferenceCount + MdlPages);

    //
    // Properly initialize the inpage support block fields we overloaded.
    //

    InPageSupport->BasePte = *ProtoPteArray;

    //
    // Build the proper InPageSupport and MDL to describe this run.
    //

    for (; ProtoPteArray < EndProtoPteArray; ProtoPteArray += 1, IoPage += 1, ApiPage += 1) {

        //
        // Fill the MDL entry for this RLE.
        //

        PointerPte = *ProtoPteArray;

        ASSERT (PointerPte != NULL);

        //
        // The PointerPte better be inside a prototype PTE allocation
        // so that subsequent page trims update the correct PTEs.
        //

        ASSERT (((PointerPte >= (PMMPTE)MmPagedPoolStart) &&
                 (PointerPte <= (PMMPTE)MmPagedPoolEnd)) ||
                ((PointerPte >= (PMMPTE)MmSpecialPoolStart) && (PointerPte <= (PMMPTE)MmSpecialPoolEnd)));

        //
        // Check the state of this prototype PTE now that the PFN lock is held.
        // If the page is not resident, the PTE must be put in transition with
        // read in progress before the PFN lock is released.
        //

        //
        // Lock page containing prototype PTEs in memory by
        // incrementing the reference count for the page.
        // Unlock any page locked earlier containing prototype PTEs if
        // the containing page is not the same for both.
        //

        if (PfnProto != NULL) {

            if (PointerPde != MiGetPteAddress (PointerPte)) {

                ASSERT (PfnProto->u3.e2.ReferenceCount > 1);

                MI_REMOVE_LOCKED_PAGE_CHARGE_AND_DECREF(PfnProto, 43);

                PfnProto = NULL;
            }
        }

        if (PfnProto == NULL) {

            ASSERT (!MI_IS_PHYSICAL_ADDRESS (PointerPte));

            PointerPde = MiGetPteAddress (PointerPte);

            if (PointerPde->u.Hard.Valid == 0) {
                MiMakeSystemAddressValidPfn (PointerPte, OldIrql);
            }

            PfnProto = MI_PFN_ELEMENT (PointerPde->u.Hard.PageFrameNumber);

            MI_ADD_LOCKED_PAGE_CHARGE(PfnProto, TRUE, 44);
            PfnProto->u3.e2.ReferenceCount += 1;

            ASSERT (PfnProto->u3.e2.ReferenceCount > 1);
        }

recheck:

        PteContents = *PointerPte;

        // LWFIX: are zero or dzero ptes possible here ?
        ASSERT (PteContents.u.Long != ZeroKernelPte.u.Long);

        if (PteContents.u.Hard.Valid == 1) {

            PageFrameIndex = MI_GET_PAGE_FRAME_FROM_PTE (&PteContents);
            Pfn1 = MI_PFN_ELEMENT (PageFrameIndex);

            ASSERT (Pfn1->u3.e1.PrototypePte == 1);

            MI_ADD_LOCKED_PAGE_CHARGE(Pfn1, TRUE, 45);
            Pfn1->u3.e2.ReferenceCount += 1;

            *ApiPage = PageFrameIndex;
            *IoPage = DummyPage;
            continue;
        }
        if ((PteContents.u.Soft.Prototype == 0) &&
            (PteContents.u.Soft.Transition == 1)) {

            //
            // The page is in transition. If there is an inpage still in
            // progress, wait for it to complete. Reference the PFN and
            // then march on.
            //

            PageFrameIndex = MI_GET_PAGE_FRAME_FROM_TRANSITION_PTE (&PteContents);

            Pfn1 = MI_PFN_ELEMENT (PageFrameIndex);

            ASSERT (Pfn1->u3.e1.PrototypePte == 1);

            if (Pfn1->u4.InPageError) {

                //
                // There was an in-page read error and there are other
                // threads colliding for this page, delay to let the
                // other threads complete and then retry.
                //

                UNLOCK_PFN (OldIrql);

                KeDelayExecutionThread (KernelMode, FALSE, (PLARGE_INTEGER)&MmHalfSecond);

                LOCK_PFN (OldIrql);

                goto recheck;
            }

            if (Pfn1->u3.e1.ReadInProgress) {
                // LWFIX - start with temp\aw.c
            }

            //
            // PTE refers to a normal transition PTE.
            //

            ASSERT ((SPFN_NUMBER)MmAvailablePages >= 0);

            if (MmAvailablePages == 0) {

                //
                // This can only happen if the system is utilizing a hardware
                // compression cache. This ensures that only a safe amount
                // of the compressed virtual cache is directly mapped so that
                // if the hardware gets into trouble, we can bail it out.
                //

                UNLOCK_PFN (OldIrql);

                KeDelayExecutionThread (KernelMode, FALSE, (PLARGE_INTEGER)&MmHalfSecond);

                LOCK_PFN (OldIrql);

                goto recheck;
            }

            //
            // The PFN reference count will be 1 already here if the
            // modified writer has begun a write of this page. Otherwise
            // it's ordinarily 0.
            //

            MI_ADD_LOCKED_PAGE_CHARGE_FOR_MODIFIED_PAGE (Pfn1, TRUE, 46);
            Pfn1->u3.e2.ReferenceCount += 1;

            *IoPage = DummyPage;
            *ApiPage = PageFrameIndex;
            continue;
        }

        ASSERT (PteContents.u.Soft.Prototype == 1);

        if ((MmAvailablePages < MM_HIGH_LIMIT) &&
            (MiEnsureAvailablePageOrWait (NULL, NULL, OldIrql))) {

            //
            // Had to wait so recheck all state.
            //

            goto recheck;
        }

        NumberOfPagesNeedingIo += 1;

        //
        // Allocate a physical page.
        //

        PageColor = MI_PAGE_COLOR_VA_PROCESS (
                        MiGetVirtualAddressMappedByPte (PointerPte),
                        &CurrentProcess->NextPageColor);

        PageFrameIndex = MiRemoveAnyPage (PageColor);

        Pfn1 = MI_PFN_ELEMENT (PageFrameIndex);

        ASSERT (Pfn1->u3.e2.ReferenceCount == 0);
        ASSERT (Pfn1->u2.ShareCount == 0);
        ASSERT (PointerPte->u.Hard.Valid == 0);

        //
        // Initialize read-in-progress PFN.
        //

        MiInitializePfn (PageFrameIndex, PointerPte, 0);

        //
        // These pieces of MiInitializePfn initialization are overridden
        // here as these pages are only going into prototype
        // transition and not into any page tables.
        //

        Pfn1->u3.e1.PrototypePte = 1;
        MI_ADD_LOCKED_PAGE_CHARGE(Pfn1, TRUE, 47);
        Pfn1->u2.ShareCount -= 1;
        Pfn1->u3.e1.PageLocation = ZeroedPageList;

        //
        // Initialize the I/O specific fields.
        //

        Pfn1->u1.Event = &InPageSupport->Event;
        Pfn1->u3.e1.ReadInProgress = 1;
        ASSERT (Pfn1->u4.InPageError == 0);

        //
        // Increment the PFN reference count in the control area for
        // the subsection.
        //

        MiReadInfo->ControlArea->NumberOfPfnReferences += 1;

        //
        // Put the prototype PTE into the transition state.
        //

        MI_MAKE_TRANSITION_PTE (TempPte,
                                PageFrameIndex,
                                PointerPte->u.Soft.Protection,
                                PointerPte);

        MI_WRITE_INVALID_PTE (PointerPte, TempPte);

        *IoPage = PageFrameIndex;
        *ApiPage = PageFrameIndex;
    }
    //
    // If all the pages were resident, dereference the dummy page references
    // now and notify our caller that I/O is not necessary.
    //

    if (NumberOfPagesNeedingIo == 0) {

        ASSERT (DummyPfn1->u3.e2.ReferenceCount > MdlPages);

        DummyPfn1->u3.e2.ReferenceCount =
            (USHORT)(DummyPfn1->u3.e2.ReferenceCount - MdlPages);

        //
        // Unlock page containing prototype PTEs.
        //

        if (PfnProto != NULL) {
            ASSERT (PfnProto->u3.e2.ReferenceCount > 1);
            MI_REMOVE_LOCKED_PAGE_CHARGE_AND_DECREF(PfnProto, 48);
        }

        UNLOCK_PFN (OldIrql);

        //
        // Return the upfront resident available charge as the
        // individual charges have all been made at this point.
        //

        MI_INCREMENT_RESIDENT_AVAILABLE (ResidentAvailableCharge,
                                         MM_RESAVAIL_FREE_BUILDMDL_EXCESS);

        return STATUS_SUCCESS;
    }

    //
    // Carefully trim leading dummy pages.
    //

    Page = (PPFN_NUMBER)(Mdl + 1);

    DummyTrim = 0;

    for (i = 0; i < MdlPages - 1; i += 1) {
        if (*Page == DummyPage) {
            DummyTrim += 1;
            Page += 1;
        }
        else {
            break;
        }
    }

    if (DummyTrim != 0) {

        Mdl->Size = (USHORT)(Mdl->Size - (DummyTrim * sizeof(PFN_NUMBER)));
        Mdl->ByteCount -= (ULONG)(DummyTrim * PAGE_SIZE);
        ASSERT (Mdl->ByteCount != 0);
        InPageSupport->ReadOffset.QuadPart += (DummyTrim * PAGE_SIZE);
        DummyPfn1->u3.e2.ReferenceCount =
            (USHORT)(DummyPfn1->u3.e2.ReferenceCount - DummyTrim);

        //
        // Shuffle down the PFNs in the MDL.
        // Recalculate BasePte to adjust for the shuffle.
        //

        Pfn1 = MI_PFN_ELEMENT (*Page);

        ASSERT (Pfn1->PteAddress->u.Hard.Valid == 0);
        ASSERT ((Pfn1->PteAddress->u.Soft.Prototype == 0) &&
                (Pfn1->PteAddress->u.Soft.Transition == 1));

        InPageSupport->BasePte = Pfn1->PteAddress;

        DestinationPage = (PPFN_NUMBER)(Mdl + 1);

        do {
            *DestinationPage = *Page;
            DestinationPage += 1;
            Page += 1;
            i += 1;
        } while (i < MdlPages);

        MdlPages -= DummyTrim;
    }

    //
    // Carefully trim trailing dummy pages.
    //

    ASSERT (MdlPages != 0);

    Page = (PPFN_NUMBER)(Mdl + 1) + MdlPages - 1;

    if (*Page == DummyPage) {

        ASSERT (MdlPages >= 2);

        //
        // Trim the last page specially as it may be a partial page.
        //

        Mdl->Size -= sizeof(PFN_NUMBER);
        if (BYTE_OFFSET(Mdl->ByteCount) != 0) {
            Mdl->ByteCount &= ~(PAGE_SIZE - 1);
        }
        else {
            Mdl->ByteCount -= PAGE_SIZE;
        }
        ASSERT (Mdl->ByteCount != 0);

        DummyPfn1->u3.e2.ReferenceCount -= 1;

        //
        // Now trim any other trailing pages.
        //

        Page -= 1;
        DummyTrim = 0;
        while (Page != ((PPFN_NUMBER)(Mdl + 1))) {
            if (*Page != DummyPage) {
                break;
            }
            DummyTrim += 1;
            Page -= 1;
        }

        if (DummyTrim != 0) {
            ASSERT (Mdl->Size > (USHORT)(DummyTrim * sizeof(PFN_NUMBER)));
            Mdl->Size = (USHORT)(Mdl->Size - (DummyTrim * sizeof(PFN_NUMBER)));
            Mdl->ByteCount -= (ULONG)(DummyTrim * PAGE_SIZE);
            DummyPfn1->u3.e2.ReferenceCount =
                (USHORT)(DummyPfn1->u3.e2.ReferenceCount - DummyTrim);
        }

        ASSERT (MdlPages > DummyTrim + 1);
        MdlPages -= (DummyTrim + 1);

#if DBG
        StartingVa = (PVOID)((PCHAR)Mdl->StartVa + Mdl->ByteOffset);

        ASSERT (MdlPages == ADDRESS_AND_SIZE_TO_SPAN_PAGES(StartingVa,
                                                           Mdl->ByteCount));
#endif
    }
    //
    // If the MDL is not already embedded in the inpage block, see if its
    // final size qualifies it - if so, embed it now.
    //

    if ((Mdl != &InPageSupport->Mdl) &&
        (Mdl->ByteCount <= (MM_MAXIMUM_READ_CLUSTER_SIZE + 1) * PAGE_SIZE)){

#if DBG
        RtlFillMemoryUlong (&InPageSupport->Page[0],
                            (MM_MAXIMUM_READ_CLUSTER_SIZE+1) * sizeof (PFN_NUMBER),
                            0xf1f1f1f1);
#endif

        RtlCopyMemory (&InPageSupport->Mdl, Mdl, Mdl->Size);

        FreeMdl = Mdl;

        Mdl = &InPageSupport->Mdl;

        ASSERT (((ULONG_PTR)Mdl & (sizeof(QUAD) - 1)) == 0);
        InPageSupport->u1.e1.PrefetchMdlHighBits = ((ULONG_PTR)Mdl >> 3);
    }

    ASSERT (MdlPages != 0);
    ASSERT (Mdl->Size - sizeof(MDL) == BYTES_TO_PAGES(Mdl->ByteCount) * sizeof(PFN_NUMBER));

    DummyPfn1->u3.e2.ReferenceCount =
        (USHORT)(DummyPfn1->u3.e2.ReferenceCount - NumberOfPagesNeedingIo);

    //
    // Unlock page containing prototype PTEs.
    //

    if (PfnProto != NULL) {
        ASSERT (PfnProto->u3.e2.ReferenceCount > 1);
        MI_REMOVE_LOCKED_PAGE_CHARGE_AND_DECREF(PfnProto, 49);
    }

    UNLOCK_PFN (OldIrql);

    InterlockedIncrement ((PLONG) &MmInfoCounters.PageReadIoCount);
    InterlockedExchangeAdd ((PLONG) &MmInfoCounters.PageReadCount,
                            (LONG) NumberOfPagesNeedingIo);

    //
    // Return the upfront resident available charge as the
    // individual charges have all been made at this point.
    //

    MI_INCREMENT_RESIDENT_AVAILABLE (ResidentAvailableCharge,
                                     MM_RESAVAIL_FREE_BUILDMDL_EXCESS);

    if (FreeMdl != NULL) {
        ASSERT (MiReadInfo->IoMdl == FreeMdl);
        MiReadInfo->IoMdl = NULL;
        ExFreePool (FreeMdl);
    }

#if DBG
    if (MiCcDebug & MI_CC_DELAY) {

        //
        // This delay provides a window to increase the chance of collided
        // faults.
        //

        KeDelayExecutionThread (KernelMode, FALSE, (PLARGE_INTEGER)&MmHalfSecond);
    }
#endif

    //
    // Finish initialization of the prefetch MDL (and the API MDL).
    //

    ASSERT ((Mdl->MdlFlags & MDL_MAPPED_TO_SYSTEM_VA) == 0);

    Mdl->MdlFlags |= (MDL_PAGES_LOCKED | MDL_IO_PAGE_READ);

    ASSERT (InPageSupport->u1.e1.Completed == 0);
    ASSERT (InPageSupport->Thread == PsGetCurrentThread());
    ASSERT64 (InPageSupport->UsedPageTableEntries == 0);
    ASSERT (InPageSupport->WaitCount >= 1);
    ASSERT (InPageSupport->u1.e1.PrefetchMdlHighBits != 0);

    //
    // The API caller expects an MDL containing all the locked pages so
    // it can be used for a transfer.
    //
    // Note that an extra reference count is not taken on each page -
    // rather when the Io MDL completes, its reference counts are not
    // decremented (except for the dummy page). This combined with the
    // reference count already taken on the resident pages keeps the
    // accounting correct. Only if an error occurs will the Io MDL
    // completion decrement the reference counts.
    //

    //
    // Initialize the inpage support block Pfn field.
    //

    LocalPrototypePte = InPageSupport->BasePte;

    ASSERT (LocalPrototypePte->u.Hard.Valid == 0);
    ASSERT ((LocalPrototypePte->u.Soft.Prototype == 0) &&
            (LocalPrototypePte->u.Soft.Transition == 1));

    PageFrameIndex = MI_GET_PAGE_FRAME_FROM_TRANSITION_PTE(LocalPrototypePte);
    Pfn1 = MI_PFN_ELEMENT (PageFrameIndex);

    InPageSupport->Pfn = Pfn1;

    //
    // Issue the paging I/O.
    //

    ASSERT (KeGetCurrentIrql() == PASSIVE_LEVEL);

    status = IoAsynchronousPageRead (InPageSupport->FilePointer,
                                     Mdl,
                                     &InPageSupport->ReadOffset,
                                     &InPageSupport->Event,
                                     &InPageSupport->IoStatus);

    if (!NT_SUCCESS (status)) {

        //
        // Set the event as the I/O system doesn't set it on errors.
        // This way our caller will automatically unroll the PFN reference
        // counts, etc, when the MiWaitForInPageComplete returns this status.
        //

        InPageSupport->IoStatus.Status = status;
        InPageSupport->IoStatus.Information = 0;
        KeSetEvent (&InPageSupport->Event, 0, FALSE);
    }

#if DBG
    if (MiCcDebug & MI_CC_DELAY) {

        //
        // This delay provides a window to increase the chance of collided
        // faults.
        //

        KeDelayExecutionThread (KernelMode, FALSE, (PLARGE_INTEGER)&MmHalfSecond);
    }
#endif

    return STATUS_ISSUE_PAGING_IO;
}
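//
// A simplified, standalone model of the leading/trailing dummy-page
// trimming performed above, not code from this module. It assumes whole
// pages throughout (the real code also special-cases a partial final page)
// and it leaves interior dummy entries alone, since they are what lets a
// single read bridge gaps of already-resident pages. The names
// TrimDummyPages, TOY_PAGE_SIZE and the DUMMY sentinel are hypothetical.
//

#if 0

#include <stdio.h>

#define TOY_PAGE_SIZE 4096
#define DUMMY ((unsigned long)~0ul)

static unsigned int
TrimDummyPages (
    unsigned long *Pages,
    unsigned int Count,
    unsigned long long *ReadOffset,
    unsigned long *ByteCount
    )
{
    unsigned int First = 0;
    unsigned int Last = Count;
    unsigned int i;

    if (Count == 0) {
        return 0;
    }

    //
    // Trim leading dummies, always keeping at least the final entry.
    //

    while (First < Count - 1 && Pages[First] == DUMMY) {
        First += 1;
    }

    //
    // Trim trailing dummies, stopping at the first real page.
    //

    while (Last > First + 1 && Pages[Last - 1] == DUMMY) {
        Last -= 1;
    }

    //
    // Narrow the transfer and advance the file offset to match.
    //

    *ReadOffset += (unsigned long long)First * TOY_PAGE_SIZE;
    *ByteCount = (Last - First) * TOY_PAGE_SIZE;

    //
    // Shuffle the surviving entries down to the front of the array.
    //

    for (i = First; i < Last; i += 1) {
        Pages[i - First] = Pages[i];
    }

    return Last - First;
}

int
main (
    void
    )
{
    unsigned long Pages[] = { DUMMY, DUMMY, 0x11, DUMMY, 0x12, 0x13, DUMMY };
    unsigned long long Offset = 0;
    unsigned long Bytes = 7 * TOY_PAGE_SIZE;
    unsigned int Remaining;

    Remaining = TrimDummyPages (Pages, 7, &Offset, &Bytes);

    //
    // Prints: pages 4 offset 8192 bytes 16384
    //

    printf ("pages %u offset %llu bytes %lu\n", Remaining, Offset, Bytes);
    return 0;
}

#endif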
NTSTATUS
MiCcCompletePrefetchIos (
    IN PMI_READ_INFO MiReadInfo
    )

/*++

Routine Description:

    This routine waits for a series of page reads to complete
    and completes the requests.

Arguments:

    MiReadInfo - Pointer to the read-list.

Return Value:

    NTSTATUS of the I/O request.

Environment:

    Kernel mode, PASSIVE_LEVEL.

--*/

{
    PMDL Mdl;
    PMMPFN Pfn1;
    PMMPFN PfnClusterPage;
    PPFN_NUMBER Page;
    NTSTATUS status;
    LONG NumberOfBytes;
    PMMINPAGE_SUPPORT InPageSupport;

    ASSERT (KeGetCurrentIrql() == PASSIVE_LEVEL);

    InPageSupport = MiReadInfo->InPageSupport;

    ASSERT (InPageSupport->Pfn != 0);

    Pfn1 = InPageSupport->Pfn;
    Mdl = MI_EXTRACT_PREFETCH_MDL (InPageSupport);
    Page = (PPFN_NUMBER)(Mdl + 1);

    status = MiWaitForInPageComplete (InPageSupport->Pfn,
                                      InPageSupport->BasePte,
                                      NULL,
                                      InPageSupport->BasePte,
                                      InPageSupport,
                                      PREFETCH_PROCESS);

    //
    // MiWaitForInPageComplete RETURNS WITH THE PFN LOCK HELD!!!
    //

    NumberOfBytes = (LONG)Mdl->ByteCount;

    while (NumberOfBytes > 0) {

        //
        // Only decrement reference counts if an error occurred.
        //

        PfnClusterPage = MI_PFN_ELEMENT (*Page);

#if DBG
        if (PfnClusterPage->u4.InPageError) {

            //
            // If the page is marked with an error, then the whole transfer
            // must be marked as not successful as well. The only exception
            // is the prefetch dummy page which is used in multiple
            // transfers concurrently and thus may have the inpage error
            // bit set at any time (due to another transaction besides
            // the current one).
            //

            ASSERT ((status != STATUS_SUCCESS) ||
                    (PfnClusterPage->PteAddress == MI_PF_DUMMY_PAGE_PTE));
        }
#endif

        if (PfnClusterPage->u3.e1.ReadInProgress != 0) {

            ASSERT (PfnClusterPage->u4.PteFrame != MI_MAGIC_AWE_PTEFRAME);

            PfnClusterPage->u3.e1.ReadInProgress = 0;

            if (PfnClusterPage->u4.InPageError == 0) {
                PfnClusterPage->u1.Event = NULL;
            }
        }

        //
        // Note the reference count for each page is NOT decremented unless
        // the I/O failed, in which case it is done below. This allows the
        // MmPrefetchPagesIntoLockedMdl API to return a locked page MDL.
        //

        Page += 1;
        NumberOfBytes -= PAGE_SIZE;
    }

    if (status != STATUS_SUCCESS) {

        //
        // An I/O error occurred during the page read
        // operation. All the pages which were just
        // put into transition must be put onto the
        // free list if InPageError is set, and their
        // PTEs restored to the proper contents.
        //

        Page = (PPFN_NUMBER)(Mdl + 1);
        NumberOfBytes = (LONG)Mdl->ByteCount;

        while (NumberOfBytes > 0) {

            PfnClusterPage = MI_PFN_ELEMENT (*Page);

            MI_REMOVE_LOCKED_PAGE_CHARGE_AND_DECREF(PfnClusterPage, 50);

            if (PfnClusterPage->u4.InPageError == 1) {

                if (PfnClusterPage->u3.e2.ReferenceCount == 0) {

                    ASSERT (PfnClusterPage->u3.e1.PageLocation ==
                            StandbyPageList);

                    MiUnlinkPageFromList (PfnClusterPage);

                    ASSERT (PfnClusterPage->u3.e2.ReferenceCount == 0);

                    MiRestoreTransitionPte (PfnClusterPage);

                    MiInsertPageInFreeList (*Page);
                }
            }

            Page += 1;
            NumberOfBytes -= PAGE_SIZE;
        }
    }

    //
    // All the relevant prototype PTEs should be in the transition or
    // valid states and all page frames should be referenced.
    // LWFIX: add code to checked build to verify this.
    //

    ASSERT (InPageSupport->WaitCount >= 1);

    UNLOCK_PFN (PASSIVE_LEVEL);

#if DBG
    InPageSupport->ListEntry.Next = NULL;
#endif

    MiFreeInPageSupportBlock (InPageSupport);
    MiReadInfo->InPageSupport = NULL;

    return status;
}
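//
// A toy model of the completion walk in MiCcCompletePrefetchIos above, not
// code from this module: read-in-progress is cleared on every page, and the
// per-page references are dropped only when the transfer failed, returning
// errored pages whose count reaches zero to the free list. The
// TOY_PFN_ENTRY type and ModelCompletePrefetch name are hypothetical, and
// the dummy-page and standby-list handling of the real code are omitted.
//

#if 0

#include <stdio.h>

typedef struct _TOY_PFN_ENTRY {
    int ReferenceCount;
    int ReadInProgress;
    int InPageError;
    int OnFreeList;
} TOY_PFN_ENTRY;

static void
ModelCompletePrefetch (
    TOY_PFN_ENTRY *Pages,
    int Count,
    int TransferFailed
    )
{
    int i;

    //
    // The read is over, so no page in this cluster is in-progress anymore.
    //

    for (i = 0; i < Count; i += 1) {
        Pages[i].ReadInProgress = 0;
    }

    if (TransferFailed == 0) {

        //
        // Success: the references deliberately stay in place, which is
        // what makes the returned API MDL a locked-page MDL.
        //

        return;
    }

    //
    // Failure: drop the reference taken when each page was locked or put
    // in transition, and free errored pages once nothing references them.
    //

    for (i = 0; i < Count; i += 1) {
        Pages[i].ReferenceCount -= 1;
        if (Pages[i].InPageError && Pages[i].ReferenceCount == 0) {
            Pages[i].OnFreeList = 1;
        }
    }
}

int
main (
    void
    )
{
    TOY_PFN_ENTRY Pages[3] = {
        { 1, 1, 0, 0 },     // newly read page, no error
        { 1, 1, 1, 0 },     // newly read page that took an in-page error
        { 2, 0, 0, 0 },     // already-resident page someone else references
    };
    int i;

    ModelCompletePrefetch (Pages, 3, 1);

    for (i = 0; i < 3; i += 1) {
        printf ("page %d refcount %d freed %d\n",
                i, Pages[i].ReferenceCount, Pages[i].OnFreeList);
    }

    return 0;
}

#endif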