hugetlbpage.c

/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;

/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  On non-Freescale implementations, this is
 * just used to track 16G pages and so is a single array.  FSL-based
 * implementations may have more than one gpage size, so we need multiple
 * arrays.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define MAX_NUMBER_GPAGES	128
struct psize_gpages {
	u64 gpage_list[MAX_NUMBER_GPAGES];
	unsigned int nr_gpages;
};
static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
#else
#define MAX_NUMBER_GPAGES	1024
static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif

static inline int shift_to_mmu_psize(unsigned int shift)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
		if (mmu_psize_defs[psize].shift == shift)
			return psize;
	return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}

#define hugepd_none(hpd)	((hpd).pd == 0)
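
/*
 * Walk the page tables for @ea starting at @pgdir and return a pointer to
 * the PTE, descending into a huge page directory (hugepd) if one is found
 * at any level of the tree.  If @shift is non-NULL it is set to the
 * page-size shift of a huge mapping, or 0 for a normal mapping.  Returns
 * NULL if nothing is mapped at @ea.
 */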
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	pg = pgdir + pgd_index(ea);
	if (is_hugepd(pg)) {
		hpdp = (hugepd_t *)pg;
	} else if (!pgd_none(*pg)) {
		pdshift = PUD_SHIFT;
		pu = pud_offset(pg, ea);
		if (is_hugepd(pu))
			hpdp = (hugepd_t *)pu;
		else if (!pud_none(*pu)) {
			pdshift = PMD_SHIFT;
			pm = pmd_offset(pu, ea);
			if (is_hugepd(pm))
				hpdp = (hugepd_t *)pm;
			else if (!pmd_none(*pm)) {
				return pte_offset_kernel(pm, ea);
			}
		}
	}

	if (!hpdp)
		return NULL;

	if (shift)
		*shift = hugepd_shift(*hpdp);
	return hugepte_offset(hpdp, ea, pdshift);
}
EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
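
/*
 * Allocate a page-table fragment to hold the huge PTEs behind one hugepd
 * entry and install it.  On FSL Book3E one fragment is referenced by
 * several consecutive higher-level entries, so all of them are filled in
 * (and unwound again if another thread installed an entry first).
 */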
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;

#ifdef CONFIG_PPC_FSL_BOOK3E
	int i;
	int num_hugepd = 1 << (pshift - pdshift);
	cachep = hugepte_cache;
#else
	cachep = PGT_CACHE(pdshift - pshift);
#endif

	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
#ifdef CONFIG_PPC_FSL_BOOK3E
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else
			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			hpdp->pd = 0;
		kmem_cache_free(cachep, new);
	}
#else
	if (!hugepd_none(*hpdp))
		kmem_cache_free(cachep, new);
	else
		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#endif
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif
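
/*
 * Find (or create) the hugepd entry covering @addr for a huge page of @sz
 * bytes and return a pointer to the PTE slot within it.  The page-table
 * level at which the hugepd lives is chosen from the page-size shift via
 * the HUGEPD_*_SHIFT macros above.
 */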
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);

	pg = pgd_offset(mm, addr);

	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}

	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_FSL_BOOK3E
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is setup.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
	int i;

	if (addr == 0)
		return;

	gpage_freearray[idx].nr_gpages = number_of_pages;

	for (i = 0; i < number_of_pages; i++) {
		gpage_freearray[idx].gpage_list[i] = addr;
		addr += page_size;
	}
}

/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
	int nr_gpages = gpage_freearray[idx].nr_gpages;

	if (nr_gpages == 0)
		return 0;

#ifdef CONFIG_HIGHMEM
	/*
	 * If gpages can be in highmem we can't use the trick of storing the
	 * data structure in the page; allocate space for this
	 */
	m = alloc_bootmem(sizeof(struct huge_bootmem_page));
	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
#else
	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
#endif

	list_add(&m->list, &huge_boot_pages);
	gpage_freearray[idx].nr_gpages = nr_gpages;
	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
	m->hstate = hstate;

	return 1;
}

/*
 * Scan the command line hugepagesz= options for gigantic pages; store those in
 * a list that we use to allocate the memory once all options are parsed.
 */
unsigned long gpage_npages[MMU_PAGE_COUNT];

static int __init do_gpage_early_setup(char *param, char *val)
{
	static phys_addr_t size;
	unsigned long npages;

	/*
	 * The hugepagesz and hugepages cmdline options are interleaved.  We
	 * use the size variable to keep track of whether or not this was done
	 * properly and skip over instances where it is incorrect.  Other
	 * command-line parsing code will issue warnings, so we don't need to.
	 */
	if ((strcmp(param, "default_hugepagesz") == 0) ||
	    (strcmp(param, "hugepagesz") == 0)) {
		size = memparse(val, NULL);
	} else if (strcmp(param, "hugepages") == 0) {
		if (size != 0) {
			if (sscanf(val, "%lu", &npages) <= 0)
				npages = 0;
			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
			size = 0;
		}
	}
	return 0;
}

/*
 * This function allocates physical space for pages that are larger than the
 * buddy allocator can handle.  We want to allocate these in highmem because
 * the amount of lowmem is limited.  This means that this function MUST be
 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
 * allocator to grab highmem.
 */
void __init reserve_hugetlb_gpages(void)
{
	static __initdata char cmdline[COMMAND_LINE_SIZE];
	phys_addr_t size, base;
	int i;

	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
		   &do_gpage_early_setup);

	/*
	 * Walk gpage list in reverse, allocating larger page sizes first.
	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
	 * When we reach the point in the list where pages are no longer
	 * considered gpages, we're done.
	 */
	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
			continue;
		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
			break;

		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
		base = memblock_alloc_base(size * gpage_npages[i], size,
					   MEMBLOCK_ALLOC_ANYWHERE);
		add_gpage(base, size, gpage_npages[i]);
	}
}

#else /* !PPC_FSL_BOOK3E */

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is setup.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;

	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif
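
/*
 * huge_pmd_unshare() is part of the generic hugetlb interface; huge page
 * PMD sharing is not implemented here, so this stub always reports that
 * nothing was unshared.
 */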
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}

#ifdef CONFIG_PPC_FSL_BOOK3E
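/*
 * On FSL Book3E the hugepte fragments can be reached by lockless walkers
 * (the DTLB pgtable walk code mentioned above, and gup), so they cannot
 * simply be freed when a range is torn down.  They are batched per CPU and
 * released after an RCU-sched grace period, unless we can see that no other
 * CPU can be using this mm, in which case they are freed immediately.
 */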
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &__get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    cpumask_equal(mm_cpumask(tlb->mm),
			  cpumask_of(smp_processor_id()))) {
		kmem_cache_free(hugepte_cache, hugepte);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
}
#endif
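
/*
 * Clear a hugepd entry (or, on FSL, the run of identical entries covering
 * one huge page) and free the hugepte fragment it points to, provided the
 * range being torn down covers the whole entry and stays within the
 * floor/ceiling limits.
 */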
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;

#ifdef CONFIG_PPC_FSL_BOOK3E
	/* Note: On fsl the hpdp may be the first of several */
	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
#else
	unsigned int shift = hugepd_shift(*hpdp);
#endif

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		hpdp->pd = 0;

	tlb->need_flush = 1;

#ifdef CONFIG_PPC_FSL_BOOK3E
	hugepd_free(tlb, hugepte);
#else
	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
#endif
}
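
/*
 * The two helpers below walk the PMD and PUD levels of a range being
 * unmapped, hand any hugepd entries they find to free_hugepd_range(), and
 * then free the now-empty lower-level table itself when floor/ceiling
 * allow, mirroring the normal free_pmd_range()/free_pud_range().
 */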
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
#ifdef CONFIG_PPC_FSL_BOOK3E
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
#endif
		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(pud)) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
#ifdef CONFIG_PPC_FSL_BOOK3E
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
#endif
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */
	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(pgd)) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
#ifdef CONFIG_PPC_FSL_BOOK3E
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
#endif
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}
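
/*
 * Look up the struct page backing a huge mapping at @address for
 * follow_page().  Returns -EINVAL if the address is not mapped by a huge
 * page.
 */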
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned shift;
	unsigned long mask;

	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

	/* Verify it is a huge page else bail. */
	if (!ptep || !shift)
		return ERR_PTR(-EINVAL);

	mask = (1UL << shift) - 1;
	page = pte_page(*ptep);
	if (page)
		page += (address & mask) / PAGE_SIZE;

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}
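
/*
 * Lockless get_user_pages_fast() helper: take references on all the normal
 * pages covered by one huge PTE.  The PTE is re-checked after the reference
 * count has been raised so a racing teardown can be detected and the
 * references dropped again.
 */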
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page, *tail;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT | _PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	tail = page;
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	/*
	 * Any tail pages need their mapcount reference taken before we
	 * return.
	 */
	while (refs--) {
		if (PageTail(tail))
			get_huge_page_tail(tail);
		tail++;
	}

	return 1;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	       unsigned long addr, unsigned long end,
	       int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(*hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}
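
/*
 * On configurations with MMU address-space slices, a hugetlb mapping must
 * be placed in a slice whose page size matches, so the address is chosen
 * by the slice allocator rather than the generic code.
 */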
#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

	return 1UL << mmu_psize_to_shift(psize);
#else
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
#endif
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}
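
/*
 * Validate a huge page size (from the command line or from init-time
 * discovery) against the hardware page sizes and page-table/slice limits,
 * and register an hstate for it if one does not already exist.
 */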
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
#ifdef CONFIG_PPC_FSL_BOOK3E
	if ((size < PAGE_SIZE) || !is_power_of_4(size))
		return -EINVAL;
#else
	if (!is_power_of_2(size)
	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
	/* Disable support for 64K huge pages when 64K SPU local store
	 * support is enabled as the current implementation conflicts.
	 */
	if (shift == PAGE_SHIFT_64K)
		return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0)
		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);
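
/*
 * Late-boot initialisation: register every hardware-supported page size as
 * a huge page size, create the page-table caches needed to back the
 * hugepds, and pick a default huge page size (recorded in HPAGE_SHIFT).
 */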
#ifdef CONFIG_PPC_FSL_BOOK3E
struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		/* Don't treat normal page sizes as huge... */
		if (shift != PAGE_SHIFT)
			if (add_huge_page_size(1ULL << shift) < 0)
				continue;
	}

	/*
	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
	 * size information encoded in them, so align them to allow this
	 */
	hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
					  HUGEPD_SHIFT_MASK + 1, 0, NULL);
	if (hugepte_cache == NULL)
		panic("%s: Unable to create kmem cache for hugeptes\n",
		      __func__);

	/* Default hpage size = 4M */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else
		panic("%s: Unable to set default huge page size\n", __func__);

	return 0;
}
#else
static int __init hugetlbpage_init(void)
{
	int psize;

	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;

		pgtable_cache_add(pdshift - shift, NULL);
		if (!PGT_CACHE(pdshift - shift))
			panic("hugetlbpage_init(): could not create "
			      "pgtable cache for %d bit pagesize\n", shift);
	}

	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

	return 0;
}
#endif
module_init(hugetlbpage_init);
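
/*
 * Flush the data cache and invalidate the instruction cache for every
 * sub-page of a compound huge page, temporarily mapping highmem pages
 * where necessary.
 */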
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}