\documentclass{article}

\usepackage{pifont}
\usepackage{graphicx} %[pdftex] OR [dvips]
\usepackage{fullpage}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{titling}
\usepackage{hyperref}
\usepackage{tikz}
\usepackage{color}
\usepackage{footnote}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usetikzlibrary{arrows}
\usetikzlibrary{positioning}
\setlength{\droptitle}{-6em}

\input{commands-new-new.tex}

\newcommand{\nuAA}{\nu_\mathit{AA}}
\newcommand{\nuAB}{\nu_\mathit{AB}}
\newcommand{\nuGA}{\nu_\mathit{GA}}
\newcommand{\nuGB}{\nu_\mathit{GB}}
\newcommand{\betaPL}{\beta_\mathit{PL}}
\newcommand{\betaAA}{\beta_\mathit{AA}}
\newcommand{\betaAS}{\beta_\mathit{AS}}
\newcommand{\thinandalso}{\hspace{.45cm}}
\newcommand{\thinnerandalso}{\hspace{.38cm}}

\input{commands-rebindings.tex}

\newcommand{\var}[1]{\textsf{#1}}

\newcommand{\ghcfile}[1]{\textsl{#1}}

\title{Implementing Backpack}

\begin{document}

\maketitle

The purpose of this document is to describe an implementation path
for Backpack in GHC\@.

\tableofcontents

\section{What we are trying to solve}

While the current ecosystem has proved itself serviceable for many years,
there are a number of major problems which cause significant headaches
for many users.  Here are some of them:

\subsection{Package reinstalls are destructive}\label{sec:destructive}

When attempting to install a new package, you might get an error like
this:

\begin{verbatim}
$ cabal install hakyll
cabal: The following packages are likely to be broken by the reinstalls:
pandoc-1.9.4.5
Graphalyze-0.14.0.0
Use --force-reinstalls if you want to install anyway.
\end{verbatim}

While this error message is understandable if you're really trying to
reinstall a package, it is quite surprising that it can occur even if
you didn't ask for any reinstalls!

The underlying cause of this problem is related to an invariant Cabal
currently enforces on a package database: there can only be one instance
of a package for any given package name and version.  This means that it
is not possible to install a package multiple times, compiled against
different dependencies.  However, sometimes, reinstalling a package with
different dependencies is the only way to fulfill version bounds of a
package!  For example: say we have three packages \pname{a}, \pname{b}
and \pname{c}.  \pname{b-1.0} is the only version of \pname{b}
available, and it has been installed and compiled against \pname{c-1.0}.
Later, the user installs an updated version \pname{c-1.1} and then
attempts to install \pname{a}, which depends on the specific versions
\pname{c-1.1} and \pname{b-1.0}.  We \emph{cannot} use the already
installed version of \pname{b-1.0}, which depends on the wrong version
of \pname{c}, so our only choice is to reinstall \pname{b-1.0} compiled
against \pname{c-1.1}.  This will break any packages, e.g. \pname{d},
which were built against the old instance of \pname{b-1.0}.

Our solution to this problem is to \emph{abolish} destructive package
installs, and allow a package to be installed multiple times with the same
package name and version.  However, allowing this poses some interesting
user interface problems, since package IDs are now no longer unambiguous
identifiers.

\subsection{Version bounds are often over/under-constrained}

When attempting to install a new package, Cabal might fail in this way:

\begin{verbatim}
$ cabal install hledger-0.18
Resolving dependencies...
cabal: Could not resolve dependencies:
# pile of output
\end{verbatim}

There are a number of possible reasons why this could occur, but usually
it's because some of the packages involved have over-constrained version
bounds, which result in an unsatisfiable set of constraints (or, at
least, a set on which Cabal gave up backtracking before finding a solution).  To
add insult to injury, most of the time the bound is nonsense and removing
it would result in a working compilation.  In fact, this situation is
so common that Cabal has a flag \verb|--allow-newer| which lets you
override the package upper bounds.

However, the flip-side is when Cabal finds a satisfying set, but your
compilation fails with a type error.  Here, you had an under-constrained
set of version bounds which didn't actually reflect the compatible
versions of a package, and Cabal picked a version of the package which
was incompatible.

Our solution to this problem is to use signatures instead of version
numbers as the primary mechanism by which compatibility is determined:
e.g., if it typechecks, it's a valid choice.  Version numbers can still
be used to reflect semantic changes not seen in the types (in
particular, ruling out buggy versions of a package is a useful
operation), but these bounds are empirical observations and can be
collected after-the-fact.
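
For instance, using the signature syntax described in the next section,
a hypothetical client could record exactly the interface it needs from a
dependency, rather than a version range (the module and entity names
here are invented for illustration):

\begin{verbatim}
package my-app where
    Data.Foo :: [ data Foo; parse :: String -> Maybe Foo ]
    Main = [ import Data.Foo; ... ]
\end{verbatim}

Any version of a package whose \m{Data.Foo} typechecks against this
signature is an acceptable choice.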

\subsection{It is difficult to support multiple implementations of a type}

This problem is perhaps best described by referring to a particular
instance of it in Haskell's ecosystem: the \texttt{String} data type.  Haskell,
by default, implements strings as linked lists of characters.  Many
libraries use \texttt{String}, because it's very
convenient to program against.  However, this representation is also
very \emph{slow}, so there are alternative implementations such as
\texttt{Text}, which uses efficient, UTF-16 encoded packed arrays.

Now, suppose you are writing a library and you don't care if the user of
your library is using \texttt{String} or \texttt{Text}.  However, you
don't want to rewrite your library twice to support both data types:
rather, you'd like to rely on some \emph{common interface} between the
two types, and let the user instantiate the implementation.  The only
way to do this in today's Haskell is using type classes; however, this
necessitates rewriting all type signatures from a nice \texttt{String ->
String} to \texttt{StringLike s => s -> s}.  The result is less readable,
required a large number of trivial edits to type signatures, and might
even be less efficient, if GHC does not appropriately specialize your code
written in this style.
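
To make the cost concrete, here is a minimal sketch of the type class
approach (the class and its methods are hypothetical):

\begin{verbatim}
-- A hypothetical common interface over String and Text
class StringLike s where
    empty  :: s
    append :: s -> s -> s

-- Before: shout :: String -> String
-- After: every signature picks up the constraint
shout :: StringLike s => s -> s
shout s = s `append` s
\end{verbatim}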

Our solution to this problem is to introduce a new mechanism of
pluggability: module holes, which let us use types and functions from a
module \texttt{Data.String} as before, but defer choosing \emph{what}
module should be used in the implementation to some later point (or
instantiate the code multiple times with different choices.)
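
By way of contrast, here is a hypothetical sketch of the same library
written against a module hole, using the package syntax reviewed in the
next section; the type signatures stay in the plain, direct style:

\begin{verbatim}
package mylib where
    Data.String :: [ data Str; append :: Str -> Str -> Str ]
    MyLib = [
        import Data.String
        shout :: Str -> Str
        shout s = append s s
    ]
\end{verbatim}

The user then links \m{Data.String} against a \texttt{String}-backed or
\texttt{Text}-backed implementation of their choice.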

\subsection{Fast moving APIs are difficult to develop/develop against}

Most packages uploaded to Hackage have authors who pay some amount of
attention to backwards compatibility and avoid making egregious
breaking changes.  However, a package like \verb|ghc-api| follows a
very different model: the library is treated by its developers as an
internal component of an application (GHC), and is frequently refactored
in a way that changes its outward-facing interface.

Arguably, an application like GHC should design a stable API and
maintain backwards compatibility against it.  However, this is a lot of
work (including refactoring) which is only being done slowly, and in the
meantime, the dump of all the modules gives users the functionality they
want (even if it keeps breaking every version.)

One could say that the core problem is there is no way for users to
easily communicate to GHC authors what parts of the API they rely on.  A
developer of GHC who is refactoring an interface will often rely on the
typechecker to let them know which parts of the codebase they need to
follow and update, and often could say precisely how to update code to
use the new interface.  User applications, which live out of tree, don't
receive this level of attention.

Our solution is to make it possible to typecheck the GHC API against a
signature.  Important consumers can publish what subsets of the GHC API
they rely on, and developers of GHC, as part of their normal build
process, type-check against these signatures.  If the signature breaks,
a developer can either do the refactoring differently to avoid the
compatibility-break, or document how to update code to use the new API\@.

\section{Backpack in a nutshell}

For a more in-depth tutorial about Backpack's features, check out Section 2
of the original Backpack paper.  In this section, we briefly review the
most important points of Backpack's design.

\paragraph{Thinning and renaming at the module level}
A user can specify a build dependency which only exposes a subset of
modules (possibly under different names.)  By itself, it's a way for the
user to resolve ambiguous module imports at the package level, without
having to use the \texttt{PackageImports} syntax extension.
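
For example, a client of some package \pname{p} could expose only one of
its modules, under a fresh name:

\begin{verbatim}
package q where
    include p (A as PA)  -- thin to just A, renamed to PA
    B = [ import PA; ... ]
\end{verbatim}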

\paragraph{Holes (abstract module definitions)}  The core
component of Backpack's support for \emph{separate modular development}
is the ability to specify abstract module bindings, or holes, which give
users of the module an obligation to provide an implementation which
fulfills the signature of the hole.  In this example:

\begin{verbatim}
package p where
    A :: [ ... ]
    B = [ import A; ... ]
\end{verbatim}

\verb|p| is an \emph{indefinite package}, which cannot be compiled until
an implementation of \m{A} is provided.  However, we can still type check
\m{B} without any implementation of \m{A}, by type checking it against
the signature.  Holes can be put into signature packages and included
(depended upon) by other packages to reuse definitions of signatures.

\paragraph{Filling in holes with an implementation}
A hole in an indefinite package can be instantiated in a \emph{mix-in}
style: namely, if a signature and an implementation have the same name,
they are linked together:

\begin{verbatim}
package q where
    A = [ ... ]
    include p -- has signature A
\end{verbatim}

Renaming is often used to make a signature and an implementation share
the same name, so that they are linked together.
An indefinite package can be instantiated multiple times with different
implementations: the \emph{applicativity} of Backpack means that if
a package is instantiated separately with the same module, the results
are type equal:

\begin{verbatim}
package q' where
    A = [ ... ]
    include p (A, B as B1)
    include p (A, B as B2)
    -- B1 and B2 are equivalent
\end{verbatim}
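
To illustrate what this equivalence buys us, the following hypothetical
client typechecks (assuming \m{B} exports some type \texttt{Y}),
precisely because \m{B1} and \m{B2} are the same module:

\begin{verbatim}
package r where
    include q'
    C = [
        import qualified B1
        import qualified B2
        f :: B1.Y -> B2.Y
        f x = x
    ]
\end{verbatim}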

\paragraph{Combining signatures together}
Unlike implementations, it's valid for multiple signatures with the
same name to be in scope.

\begin{verbatim}
package a-sig where
    A :: [ ... ]
package a-sig2 where
    A :: [ ... ]
package q where
    include a-sig
    include a-sig2
    B = [ import A; ... ]
\end{verbatim}

These signatures \emph{merge} together, providing the union of the
functionality (assuming the types of individual entities are
compatible.)  Backpack has a very simple merging algorithm: types must
match exactly to be compatible (\emph{width} subtyping).

\clearpage

\section{Module and package identity}

\begin{figure}[H]
\begin{tabular}{p{0.45\textwidth} p{0.45\textwidth}}
\begin{verbatim}
package p where
    A :: [ data X ]
    P = [ import A; data Y = Y X ]
package q where
    A1 = [ data X = X1 ]
    A2 = [ data X = X2 ]
    include p (A as A1, P as P1)
    include p (A as A2, P as P2)
\end{verbatim}
&
\begin{verbatim}
package p where
    A :: [ data X ]
    P = [ data T = T ] -- no A import!
package q where
    A1 = [ data X = X1 ]
    A2 = [ data X = X2 ]
    include p (A as A1, P as P1)
    include p (A as A2, P as P2)
\end{verbatim}
\\
(a) Type equality must consider holes\ldots &
(b) \ldots but how do we track dependencies? \\
\end{tabular}
\caption{Two similar examples}\label{fig:simple-ex}
\end{figure}

One of the central questions one encounters when type checking Haskell
code is: \emph{when are two types equal}?  In ordinary Haskell, the
answer is simple: ``They are equal if their \emph{original names} (i.e.,
where they were originally defined) are the same.''  However, in
Backpack, the situation is murkier due to the presence of \emph{holes}.
Consider the pair of examples in Figure~\ref{fig:simple-ex}.
In Figure~\ref{fig:simple-ex}a, the types \m{P1}.\verb|Y| and \m{P2}.\verb|Y| should not be
considered equal, even though na\"\i vely their original names are both
\pname{p}:\m{P}.\verb|Y|, since their arguments are different \verb|X|'s!
On the other hand, if we instantiated \pname{p} twice with the same \m{A}
(e.g., change the second include to \texttt{include p (A as A1, P as P2)}),
we might consider the two resulting \verb|Y|'s
equal, an \emph{applicative} semantics of instantiation.  In
Figure~\ref{fig:simple-ex}b, we see that even though \m{A} was instantiated differently,
we might reasonably wonder if \texttt{T} should still be considered the same,
since it has no dependence on the actual choice of \m{A}.

In fact, there are quite a few different choices that can be made here.
Figures~\ref{fig:applicativity}~and~\ref{fig:granularity} summarize the various
choices on two axes: the granularity of applicativity (under what circumstances
do we consider two types equal) and the granularity of dependency (under what
circumstances do we consider two types distinct).  A \ding{52} means the design we have chosen
answers the question affirmatively---\ding{54}, negatively---but all of these choices
are valid points in the design space.

\subsection{The granularity of applicativity}

An applicative semantics of package instantiation states that if a package is
instantiated with the ``same arguments'', then the resulting entities it defines
should also be considered equal.  Because Backpack uses \emph{mix-in modules},
it is very natural to consider the arguments of a package instantiation as the
modules, as shown in Figure~\ref{fig:applicativity}b: the same module \m{A} is
linked for both instantiations, so \m{P1} and \m{P2} are considered equal.

However, if we consider the situation at a finer granularity, we might say, ``Well,
for a declaration \texttt{data Y = Y X}, only the definition of type \verb|X| matters.
If they are the same, then \verb|Y| is the same.''  In that case, we might accept
that in Figure~\ref{fig:applicativity}a, even though \pname{p} is instantiated
with different modules, at the end of the day, the important component \verb|X| is
the same in both cases, so \verb|Y| should also be the same.  This is a sort of
``extreme'' view of modular development, where every declaration is desugared
into a separate module.  In our design, we will be a bit more conservative, and
continue with module level applicativity, in the same manner as Paper Backpack.

\paragraph{Implementation considerations}
Compiling Figure~\ref{fig:applicativity}b to dynamic libraries poses an
interesting challenge, if every package compiles to a dynamic library.
When we compile package \pname{q}, the libraries we end up producing are \pname{q}
and an instance of \pname{p} (instantiated with \pname{q}:\m{A}).  Furthermore,
\pname{q} refers to code in \pname{p} (the import in \m{Q}), and vice versa (the usage
of the instantiated hole \m{A}).  When building static libraries, this circular
dependency doesn't matter: when we link the executable, we can resolve all
of the symbols in one go.  However, when the libraries in question are
dynamic libraries \verb|libHSq.so| and \verb|libHSp(q:A).so|, we now have
a \emph{circular dependency} between the two dynamic libraries, and most dynamic
linkers will not be able to load either of these libraries.

To break the circularity in Figure~\ref{fig:applicativity}b, we have to \emph{inline}
the entire module \m{A} into the instance of \pname{p}.  Since the code is exactly
the same, we can still consider the instance of \m{A} in \pname{q} and in \pname{p}
type equal.  However, in Figure~\ref{fig:applicativity}c, applicativity has been
done at a coarser level: although we are using Backpack's module mixin syntax,
morally, this example is filling in the holes with the \emph{package} \pname{a}
(rather than a module).  In this case, we can achieve code sharing, since
\pname{p} can refer directly to \pname{a}, breaking the circularity.

\newcolumntype{C}{>{\centering\arraybackslash}p{0.3\textwidth}}
    \begin{savenotes}
\begin{figure}
    \begin{tabular}{C C C}
\begin{verbatim}
package q where
  A = [ data X = X ]
  A1 = [ import A; x = 0 ]
  A2 = [ import A; x = 1 ]
  include p (A as A1, P as P1)
  include p (A as A2, P as P2)
  Q = [ import P1; ... ]
\end{verbatim}
&
\begin{verbatim}
package q where
  A = [ data X = X ]


  include p (A, P as P1)
  include p (A, P as P2)
  Q = [ import P1; ... ]
\end{verbatim}
&
\begin{verbatim}
package a where
  A = [ data X = X ]
package q where
  include a
  include p (A, P as P1)
  include p (A, P as P2)
  Q = [ import P1; ... ]
\end{verbatim}
  \\
  (a) Declaration applicativity \ding{54} &
  (b) Module applicativity \ding{52} &
  (c) Package applicativity \ding{52} \\
\end{tabular}
\caption{Choices of granularity of applicativity on \pname{p}: given \texttt{data Y = Y X},  is \m{P1}.\texttt{Y} equal to \m{P2}.\texttt{Y}?}\label{fig:applicativity}
\end{figure}
\end{savenotes}

\subsection{The granularity of dependency}

\begin{savenotes}
\newcolumntype{C}{>{\centering\arraybackslash}p{0.3\textwidth}}
\begin{figure}
    \begin{tabular}{C C C}
\begin{verbatim}
package p(A,P) where
  A :: [ data X ]
  P = [
    import A
    data T = T
    data Y = Y X
  ]
\end{verbatim}
&
\begin{verbatim}
package p(A,P) where
  A :: [ data X ]
  B = [ data T = T ]
  C = [
    import A
    data Y = Y X
  ]
  P = [
    import B
    import C
  ]
\end{verbatim}
&
\begin{verbatim}
package b where
  B = [ data T = T ]
package c where
  A :: [ data X ]
  C = [
    import A
    data Y = Y X
  ]
package p(A,P) where
  include b; include c
  P = [ import B; import C ]
\end{verbatim}
  \\
  (a) Declaration granularity \ding{54} &
  (b) Module granularity \ding{54} &
  (c) Package granularity \ding{52} \\
\end{tabular}
\caption{Choices of granularity for dependency: is the identity of \texttt{T} independent of how \m{A} is instantiated?}\label{fig:granularity}
\end{figure}

\end{savenotes}

In the previous section, we considered \emph{what} kind of entity may serve as the
argument of an instantiation; in this section we consider \emph{which} entities are
actually counted among the dependencies of the declaration/module/package we're writing.
Figure~\ref{fig:granularity} contains a series of examples which exemplify the choice
of whether or not to collect dependencies on a per-declaration, per-module or per-package basis:

\begin{itemize}
    \item Package-level granularity states that the modules in a package are
considered to depend on \emph{all} of the holes in the package, even if
the hole is never imported.  Figure~\ref{fig:granularity}c is factored so that
\verb|T| is defined in a distinct package \pname{b} with no holes, so no matter
the choice of \m{A}, \m{B}.\verb|T| will be the same.  On the other hand, in
Figure~\ref{fig:granularity}b, there is a hole in the package defining \m{B},
so the identity of \verb|T| will depend on the choice of \m{A}.

\item Module-level granularity states that each module has its own set of dependencies,
computed by looking at its import statements.  In this setting, \verb|T| in Figure~\ref{fig:granularity}b
is independent of \m{A}, since the hole is never imported in \m{B}.  But once again, in
Figure~\ref{fig:granularity}a, there is an import in the module defining \verb|T|,
so the identity of \verb|T| once again depends on the choice of \m{A}.

\item Finally, at the finest level of granularity, one could chop up \pname{p} in
Figure~\ref{fig:granularity}a, looking at the type declaration-level dependency
to suss out whether or not \verb|T| depends on \m{A}.  It doesn't refer to
anything in \m{A}, so it is always considered the same.
\end{itemize}

It is well worth noting that the system described by Paper Backpack tracks dependencies per module;
however, we have decided that we will implement tracking per package instead:
a coarser granularity which accepts fewer programs.

Is a finer form of granularity \emph{better?} Not necessarily!  For
one, we can always split packages into further subpackages (as was done
in Figure~\ref{fig:granularity}c) which better reflect the internal hole
dependencies, so it is always possible to rewrite a program to make it
typecheck---just with more packages.  Additionally, the finer the
granularity of dependency, the more work I have to do to understand what
the identities of entities in a module are.  In Paper Backpack, I have
to understand the imports of all modules in a package; with
declaration-granularity, I have to understand the entire code.  This is
a lot of work for the developer to think about; a coarser model is
easier to remember and reason about.  Finally, package-level granularity
is much easier to implement, as it preserves the previous compilation
model, \emph{one library per package}.  At a fine level of granularity, we
may end up repeatedly compiling a module which actually should be considered
``the same'' as any other instance of it.

Nevertheless, finer granularity can be desirable from an end-user perspective.
Usually, these circumstances arise when library-writers are forced to split their
components into many separate packages, when they would much rather have written
a single package.  For example, if I define a data type in my library, and would
like to define a \verb|Lens| instance for it, I would create a new package just
for the instance, in order to avoid saddling users who aren't interested in lenses
with an extra dependency.  Another example is test suites, which have dependencies
on various test frameworks that a user won't care about if they are not planning
on testing the code. (Cabal has a special case for this, allowing the user
to write effectively multiple packages in a single Cabal file.)

\subsection{Summary}

We can summarize all of the various schemes by describing the internal data
types that would be defined by GHC under each regime.  First, we have
the shared data structures, which correspond closely to what users are
used to seeing:

\begin{verbatim}
<pkg-name>   ::= containers, ...
<pkg-version> ::= 1.0, ...
<pkg-id>     ::= <pkg-name>-<pkg-version>
<mod-name>   ::= Data.Set, ...
<occ>        ::= empty, ...
\end{verbatim}

Changing the \textbf{granularity of applicativity} modifies how we represent the
list of dependencies associated with an entity.  With module applicativity,
we list module identities (not yet defined); with declaration applicativity
we actually list the original names (i.e., ids).

\begin{verbatim}
<deps>       ::= <id>, ...      # Declaration applicativity
<deps>       ::= <module>, ...  # Module applicativity
\end{verbatim}

Changing the \textbf{granularity of dependency} affects how we compute
the lists of dependencies, and what entities are well defined:

\begin{verbatim}
# Package-level granularity
<pkg-key>    ::= hash(<pkg-id> + <deps for pkg>)
<module>     ::= <pkg-key> : <mod-name>
<id>         ::= <module> . <occ>

# Module-level granularity
<pkg-key>    not defined
<module>     ::= hash(<pkg-id> : <mod-name> + <deps for mod>)
<id>         ::= <module> . <occ>

# Declaration-level granularity
<pkg-key>    not defined
<module>     not defined
<id>         ::= hash(<pkg-id> : <mod-name> . <occ> + <deps for decl>)
\end{verbatim}

Notice that as we increase the granularity, the notion of a ``package'' and a ``module''
become undefined.  This is because, for example, with module-level granularity, a single
``package'' may result in several modules, each of which has a different set of
dependencies.  It doesn't make much sense to refer to the package as a monolithic entity,
because the point of splitting up the dependencies was so that if a user relies only
on a single module, it has a correspondingly restricted set of dependencies.
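
To make the package-level scheme concrete, here is a minimal Haskell
sketch of how the identifiers above might be computed (the types and the
stand-in hash are illustrative, not GHC's actual implementation):

\begin{verbatim}
type PkgId   = String   -- e.g. "p-1.0"
type ModName = String   -- e.g. "Data.Set"
type OccName = String   -- e.g. "empty"

newtype PackageKey = PackageKey String

-- Package-level granularity: the key hashes the package ID
-- together with how *all* of the package's holes were filled.
mkPackageKey :: PkgId -> [(ModName, String)] -> PackageKey
mkPackageKey pid holes = PackageKey (hash62 (pid ++ show holes))
  where hash62 = id  -- stand-in for a real base-62 hash

-- <module> ::= <pkg-key> : <mod-name>
mkModule :: PackageKey -> ModName -> String
mkModule (PackageKey k) m = k ++ ":" ++ m

-- <id> ::= <module> . <occ>
mkId :: PackageKey -> ModName -> OccName -> String
mkId key m occ = mkModule key m ++ "." ++ occ
\end{verbatim}
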
\subsection{The new scheme, formally}

\begin{wrapfigure}{R}{0.5\textwidth}
\begin{myfig}
\[
\begin{array}{@{}lr@{\;}c@{\;}l@{}}
    \text{Package Names (\texttt{PkgName})} & P &\in& \mathit{PkgNames} \\
    \text{Module Path Names (\texttt{ModName})} & p &\in& \mathit{ModPaths} \\
    \text{Module Identity Vars} & \alpha,\beta &\in& \mathit{IdentVars} \\
    \text{Package Key (\texttt{PackageKey})} & \K &::=& P(\vec{p\mapsto\nu}) \\
    \text{Module Identities (\texttt{Module})} & \nu &::=&
      \alpha ~|~
      \K\colon\! p \\
    \text{Module Identity Substs} & \phi,\theta &::=&
      \{\vec{\alpha \coloneqq \nu}\} \\
\end{array}
\]
\caption{Module Identities}
\label{fig:mod-idents}
\end{myfig}
\end{wrapfigure}

In this section, we give a formal treatment of our choice in the design space, in the
same style as the Backpack paper, but omitting mutual recursion, as it follows straightforwardly.
Physical module
identities $\nu$, the \texttt{Module} component of \emph{original names} in GHC, are either (1) \emph{variables} $\alpha$, which are
used to represent holes\footnote{In practice, these will just be fresh paths in a special package key for variables.} or (2) a concrete module $p$ defined in package
$P$, with holes instantiated with other module identities (might be
empty)\footnote{In Paper Backpack, we would refer to just $P$:$p$ as the identity
constructor.  However, we've written the subterms specifically next to $P$ to highlight the semantic difference of these terms.}.

As in traditional Haskell, every package contains a number of module
files at some module path $p$; within a package these paths are
guaranteed to be unique.\footnote{In Paper Backpack, the module expressions themselves are used to refer to globally unique identifiers for each literal.  This makes the metatheory simpler, but for implementation purposes it is convenient to conflate the \emph{original} module path that a module is defined at with its physical identity.}  When we write inline module definitions, we assume
that they are immediately assigned to a module path $p$ which is incorporated
into their identity.  A module identity $\nu$ simply augments this
with subterms $\vec{p\mapsto\nu}$ representing how \emph{all} holes in the package $P$
were instantiated.\footnote{In Paper Backpack, we do not distinguish between holes/non-holes, and we consider all imports of the \emph{module}, not the package.}  This naming is stable because the current Backpack surface syntax does not allow a logical path in a package
to be undefined.  A package key is $P(\vec{p\mapsto\nu})$.

Here is the very first example from
Section 2 of the original Backpack paper, \pname{ab-1}:

\begin{example}
\Pdef{ab-1}{
    \Pmod{A}{x = True}
    \Pmod{B}{\Mimp{A}; y = not x}
    % \Pmodd{C}{\mname{A}}
}
\end{example}

The identities of \m{A} and \m{B} are
\pname{ab-1}:\mname{A} and \pname{ab-1}:\mname{B}, respectively.\footnote{In Paper Backpack, the identity for \mname{B} records its import of \mname{A}, but since it is definite, this is strictly redundant.} In a package with holes, each
hole (within the package definition) gets a fresh variable as its
identity, and all of the holes associated with package $P$ are recorded. Consider \pname{abcd-holes-1}:

\begin{example}
\Pdef{abcd-holes-1}{
    \Psig{A}{x :: Bool} % chktex 26
    \Psig{B}{y :: Bool} % chktex 26
    \Pmod{C}{x = False}
    \Pmodbig{D}{
        \Mimpq{A}\\
        \Mimpq{C}\\
        % \Mexp{\m{A}.x, z}\\
        z = \m{A}.x \&\& \m{C}.x
    }
}
\end{example}

The identities of the four modules
are, in order, $\alpha_a$, $\alpha_b$, $\pname{abcd-holes-1}(\alpha_a,\alpha_b)$:\mname{C}, and
$\pname{abcd-holes-1}(\alpha_a,\alpha_b)$:\mname{D}.\footnote{In Paper Backpack, the granularity is at the module level, so the subterms of \mname{C} and \mname{D} can differ.}  We include both $\alpha_a$ and $\alpha_b$ in both \mname{C} and \mname{D}, regardless of the imports.  When we link the package against an implementation of the hole, these variables are replaced with the identities of the modules we linked against.

Shaping proceeds in the same way as in Paper Backpack, except that the
shaping judgment must also accept the package key
$P(\vec{p\mapsto\alpha})$ so we can create identifiers with
\textsf{mkident}.  This implies we must know ahead of time what the holes
of a package are.

\paragraph{A full Backpack comparison}
If you're curious about how the rest of the Backpack examples translate,
look no further than this section.

First, consider the module identities in the \m{Graph} instantiations in
\pname{multinst}, shown in Figure 2 of the original Backpack paper.
In the definition of \pname{structures}, assume that the variables for
\m{Prelude} and \m{Array} are $\alpha_P$ and $\alpha_A$ respectively.
The identity of \m{Graph} is $\pname{structures}(\alpha_P, \alpha_A)$:\m{Graph}. Similarly, the identities of the two array implementations
are $\nu_{AA} = \pname{arrays-a}(\alpha_P)$:\m{Array} and
$\nu_{AB} = \pname{arrays-b}(\alpha_P)$:\m{Array}.\footnote{Notice that the subterms coincide with Paper Backpack!  A sign that module level granularity is not necessary for many use-cases.}

The package \pname{graph-a} is more interesting because it
\emph{links} the packages \pname{arrays-a} and \pname{structures}
together, with the implementation of \m{Array} from \pname{arrays-a}
\emph{instantiating} the hole \m{Array} from \pname{structures}.  This
linking is reflected in the identity of the \m{Graph} module in
\pname{graph-a}: whereas in \pname{structures} it was $\nu_G =
\pname{structures}(\alpha_P, \alpha_A)$:\m{Graph}, in \pname{graph-a} it is
$\nu_{GA} = \nu_G[\nu_{AA}/\alpha_A] = \pname{structures}(\alpha_P, \nu_{AA})$:\m{Graph}.  Similarly, the identity of \m{Graph} in
\pname{graph-b} is $\nu_{GB} = \nu_G[\nu_{AB}/\alpha_A] =
\pname{structures}(\alpha_P, \nu_{AB})$:\m{Graph}.  Thus, linking consists
of substituting the variable identity of a hole by the concrete
identity of the module filling that hole.
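
A minimal Haskell sketch of these identities, and of linking as
substitution, might look as follows (mirroring
Figure~\ref{fig:mod-idents}; the representations are illustrative only):

\begin{verbatim}
type PkgName  = String
type ModPath  = String
type IdentVar = String

data PackageKey = PackageKey PkgName [(ModPath, Module)]
data Module     = Var IdentVar            -- a hole
                | Mod PackageKey ModPath  -- a concrete module P(...):p

-- Linking: substitute hole variables by the identities of the
-- modules filling them, everywhere they occur.
subst :: [(IdentVar, Module)] -> Module -> Module
subst phi (Var a) = maybe (Var a) id (lookup a phi)
subst phi (Mod (PackageKey p holes) m) =
    Mod (PackageKey p [ (h, subst phi nu) | (h, nu) <- holes ]) m
\end{verbatim}

Under this representation, $\nu_{GA} = \nu_G[\nu_{AA}/\alpha_A]$ is
simply \texttt{subst} applied to $\nu_G$ with the single binding
$\alpha_A \coloneqq \nu_{AA}$.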

Lastly, \pname{multinst} makes use of both of these \m{Graph}
modules, under the aliases \m{GA} and \m{GB}, respectively.
Consequently, in the \m{Client} module, \code{\m{GA}.G} and
\code{\m{GB}.G} will be correctly viewed as distinct types since they
originate in modules with distinct identities.

As \pname{multinst} illustrates, module identities effectively encode
dependency graphs at the package level.\footnote{In Paper Backpack, module identities
encode dependency graphs at the module level.  In both cases, however, what is being
depended on is always a module.}  Like in Paper Backpack, we have an \emph{applicative}
semantics of instantiation, and the applicativity example in Figure 3 of the
Backpack paper still type checks.  However, because we are operating at a coarser
granularity, modules may pick up spurious dependencies on holes that they never
actually use, which means fewer type equalities may hold.


\subsection{Cabal dependency resolution}

Currently, when we compile a Cabal
package, Cabal goes ahead and resolves \verb|build-depends| entries with actual
implementations, which we compile against.  The package key,
independently of Backpack, records the transitive dependency tree selected
during this dependency resolution process, so that we can install \pname{libfoo-1.0}
twice compiled against different versions of its dependencies.
What is the relationship between this transitive dependency tree of \emph{packages}
and the subterms of our package identities, which are \emph{modules}?  Does one
subsume the other?  In fact, these are separate mechanisms---two levels of indirection,
so to speak.

To illustrate, suppose I write a Cabal file with \verb|build-depends: foobar|.  A reasonable assumption is that this translates into a
Backpack package which has \verb|include foobar|.  However, this is not
actually a Paper Backpack package: Cabal's dependency solver has to
rewrite all of these package references into versioned references
\verb|include foobar-0.1|.  For example, this is a pre-package:

\begin{verbatim}
package foo where
    include bar
\end{verbatim}

and this is a Paper Backpack package:

\begin{verbatim}
package foo-0.3[bar-0.1[baz-0.2]] where
    include bar-0.1[baz-0.2]
\end{verbatim}

This tree is very similar to the one tracking dependencies for holes,
but we must record this tree \emph{even} when our package has no holes.
%   As a final example, the full module
%   identity of \m{B1} in Figure~\ref{fig:granularity} may actually be $\pname{p-0.9(q-1.0[p-0.9]:A1)}$:\m{B}.

\paragraph{Linker symbols} As we increase the amount of information in
PackageId, it's important to be careful about the length of these IDs,
as they are used for exported linker symbols (e.g.
\verb|base_TextziReadziLex_zdwvalDig_info|).  Very long symbol names
hurt compile and link time, object file sizes, GHCi startup time,
dynamic linking, and make gdb hard to use.
As such, we've done away with full package names and versions; instead,
there is simply a base-62 encoded hash, with the first five characters of the package
name for user-friendliness.
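
A hypothetical sketch of this rendering (the hash below is a toy
stand-in, \emph{not} GHC's actual hash function):

\begin{verbatim}
import Data.Char (ord)
import Data.List (foldl')

-- Toy string hash, for illustration only.
toyHash :: String -> Integer
toyHash = foldl' step 2166136261
  where step h c = (h * 16777619 + fromIntegral (ord c)) `mod` 2^64

-- Render a hash in base-62.
base62 :: Integer -> String
base62 0 = "0"
base62 n = go n ""
  where
    digits = ['0'..'9'] ++ ['a'..'z'] ++ ['A'..'Z']
    go 0 acc = acc
    go k acc = let (q, r) = k `divMod` 62
               in go q (digits !! fromIntegral r : acc)

-- First five characters of the package name, then the hash.
renderPackageKey :: String -> String -> String
renderPackageKey name key = take 5 name ++ "_" ++ base62 (toyHash key)
\end{verbatim}

So a key for \pname{containers} would render as \texttt{conta\_} followed
by the encoded hash.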

\subsection{Package selection}

When I fire up \texttt{ghci} with no arguments, GHC somehow creates
out of thin air some consistent set of packages, whose modules I can
load using \texttt{:m}.  This functionality is extremely handy for
exploratory work, but actually GHC has to work quite hard in order
to generate this set of packages, the contents of which are all
dumped into a global namespace.  For example, GHC doesn't have access
to Cabal's dependency solver, nor does it know \emph{which} packages
the user is going to ask for, so it can't just run a constraint solver,
get a set of consistent packages to offer and provide them to the user.\footnote{Some might
argue that depending on a global environment in this fashion is wrong, because
when you perform a build in this way, you have absolutely no idea what
dependencies you actually ended up using.  But the fact remains that for
end users, this functionality is very useful.}

To make matters worse, while in the current design of the package database,
a package is uniquely identified by its package name and version, in
the Backpack design, it is \emph{mandatory} that we support multiple
packages installed in the database with the same package name and version,
and this can result in complications in the user model.  This further
complicates GHC's default package selection algorithm.

In this section, we describe how the current algorithm operates (including
what invariants it tries to uphold and where it goes wrong), and how
to generalize the algorithm to handle
multiple instances in the package database.  We'll also try to tease
apart the relationship between package keys and installed package IDs in
the database.

\paragraph{The current algorithm} Abstractly, GHC's current package
selection algorithm operates as follows.  For every package name, select
the package with the latest version (recall that this is unique) which
is also \emph{valid}.  A package is valid if:

\begin{itemize}
    \item It exists in the package database,
    \item All of its dependencies are valid,
    \item It is not shadowed by a package with the same package ID\footnote{Recall that currently, a package ID uniquely identifies a package in the package database} in
        another package database (unless it is in the transitive closure
        of a package named by \texttt{-package-id}), and
    \item It is not ignored with \texttt{-ignore-package}.
\end{itemize}
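
A minimal sketch of this check, assuming a simplified model of the
package database (the types and fields are illustrative, not GHC's):

\begin{verbatim}
import qualified Data.Map as Map
import           Data.Map (Map)

type InstalledPackageId = String

data PackageConfig = PackageConfig
  { pkgDepends  :: [InstalledPackageId]
  , pkgShadowed :: Bool  -- shadowed by another database?
  , pkgIgnored  :: Bool  -- named by -ignore-package?
  }

type PackageDb = Map InstalledPackageId PackageConfig

valid :: PackageDb -> InstalledPackageId -> Bool
valid db ipid = case Map.lookup ipid db of
  Nothing  -> False   -- missing from the database
  Just cfg -> not (pkgShadowed cfg)
           && not (pkgIgnored cfg)
           && all (valid db) (pkgDepends cfg)
\end{verbatim}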

Package validity is probably the minimal criterion for GHC to ensure
that it can actually \emph{use} a package.  If the package is missing,
GHC can't find the interface files or object code associated with the
package.  Ignoring packages is a way of pretending that a package is
missing from the database.

Package validity is also a very weak criterion.  Another criterion we
might hope holds is \emph{consistency}: when we consider the transitive
closure of all selected packages, for any given package name, there
should only be one instance providing that package.  It is trivially
easy to break this property: suppose that I have packages \pname{a-1.0},
\pname{b-1.0} compiled against \pname{a-1.0}, and \pname{a-1.1}.  GHC
will happily load \pname{b-1.0} and \pname{a-1.1} together in the same
interactive session (they are both valid and the latest versions), even
though \pname{b-1.0}'s dependency is inconsistent with another package
that was loaded.  The user will notice if they attempt to treat entities
from \pname{a} reexported by \pname{b-1.0} and entities from
\pname{a-1.1} as type equal.  Here is one user who had this problem:
\url{http://stackoverflow.com/questions/12576817/}.  In some cases, the
problem is easy to work around (there is only one offending package
which just needs to be hidden), but if the divergence is deep in two
separate dependency hierarchies, it is often easier to just blow away
the package database and try again.

Perversely, destructive reinstallation helps prevent these sorts of
inconsistent databases.  While inconsistencies can arise when multiple
versions of a package are installed, multiple versions will frequently
lead to the necessity of reinstalls.  In the previous example, if a user
attempts to Cabal install a package which depends on \pname{a-1.1} and
\pname{b-1.0}, Cabal's dependency solver will propose reinstalling
\pname{b-1.0} compiled against \pname{a-1.1}, in order to get a
consistent set of dependencies.  If this reinstall is accepted, we
invalidate all packages in the database which were previously installed
against \pname{b-1.0} and \pname{a-1.0}, excluding them from GHC's
selection process and making it more likely that the user will see a
consistent view of the database.

\paragraph{Enforcing consistent dependencies}  From the user's
perspective, it would be desirable if GHC never loaded a set of packages
whose dependencies were inconsistent.
There are two ways we can go
about doing this.  First, we can improve GHC's logic so that it doesn't
pick an inconsistent set.  However, as a point of design, we'd like to
keep whatever resolution GHC does as simple as possible (in an ideal
world, we'd skip the validity checks entirely, but they ended up being
necessary to prevent a broken database from stopping GHC from starting up
at all). In particular, GHC should \emph{not} learn how to do
backtracking constraint solving: that's in the domain of Cabal.  Second,
we can modify the logic of Cabal to enforce that the package database is
always kept in a consistent state, similar to the consistency check
Cabal applies to sandboxes, where it refuses to install a package to a
sandbox if the resulting dependencies would not be consistent.

The second alternative is appealing, but Cabal sandboxes are currently
designed for small, self-contained single projects, as opposed to the
global ``universe'' that a default environment is intended to provide.
For example, with a Cabal sandbox environment, it's impossible to
upgrade a dependency to a new version without blowing away the sandbox
and starting again.  To support upgrades, Cabal needs to do some work:
when a new version is put in the default set, all of the
reverse-dependencies of the old version are now inconsistent.  Cabal
should offer to hide these packages or reinstall them compiled against
the latest version.  Furthermore, because we in general may not have write
access to all visible package databases, this visibility information
must be independent of the package databases themselves.

As a nice bonus, Cabal should also be able to snapshot the older
environment which captures the state of the universe prior to the
installation, in case the user wants to revert back.

\paragraph{Modifying the default environment}  Currently, after GHC
calculates the default package environment, a user may further modify
the environment by passing package flags to GHC, which can be used to
explicitly hide or expose packages.  How do these flags interact with
our Cabal-managed environments?  Hiding packages is simple enough,
but exposing packages is a bit dicier.  If a user asks for a different
version of a package than in the default set, it will probably be
inconsistent with the rest of the dependencies.  Cabal would have to
be consulted to figure out a maximal set of consistent packages with
the constraints given.  Alternatively, we could just supply the package
with no claims of consistency.

However, this use-case is rare.  Usually, a package is hidden not because
the user wants some other specific version: it is hidden simply because we're not
interested in loading it by default (\pname{ghc-api} is the canonical
example, since it dumps a lot of modules into the top-level namespace).
If we distinguish packages which are consistent but hidden, their
loads can be handled appropriately.

\paragraph{Consistency in Backpack} We have stated as an implicit
assumption that if we have both \pname{foo-1.0} and \pname{foo-1.1}
available, only one should be loaded at a time.  What are the
consequences if both of these packages are loaded at the same time?  An
import of \m{Data.Foo} provided by both packages would be ambiguous and
the user might find some type equalities they expect to hold would not.
However, the result is not \emph{unsound}: indeed, we might imagine a
user purposely wanting two different versions of a library in the same
program, renaming the modules they provided so that they could be
referred to unambiguously.  As another example, suppose that we have an
indefinite package with a hole that is instantiated multiple times.  In
this case, a user absolutely may want to refer to both instantiations,
once again renaming modules so that they have unique names.

There are two consequences of this.  First, while the default package
set may enforce consistency, a user should still be able to explicitly
ask for a package instance, renamed so that its modules don't conflict,
and then use it in their program.  Second, instantiated indefinite packages
should \emph{never} be placed in the default set, since it's impossible
to know which instantiation is the one the user prefers.  A definite package
can reexport an instantiated module under an unambiguous name if the user
so pleases.

\paragraph{Shadowing, installed package IDs, ABI hashes and package
keys} Shadowing plays an important role for maintaining the soundness of
compilation; call this the \emph{compatibility} of the package set.  The
problem it addresses is when there are two distinct implementations of a
module, but because their package ID (or package key, in the new world
order) are the same, they are considered type equal.  It is absolutely
wrong for a single program to include both implementations
simultaneously (the symbols would conflict and GHC would incorrectly
conclude things were type equal when they're not), so \emph{shadowing}'s
job is to ensure that only one instance is picked, and all the other
instances are considered invalid (along with their reverse dependencies, and so on).
Recall that in current GHC, within a package database, a package
instance is uniquely identified by its package ID\@; thus, shadowing
only needs to take place between package databases.  An interesting
corner case is when the same package ID occurs in both databases, but
the installed package IDs are the \emph{same}.  Because the installed
package ID is currently simply an ABI hash, we skip shadowing, because
the packages are---in principle---interchangeable.

There are currently a number of proposed changes to this state of affairs:

\begin{itemize}
    \item Change installed package IDs to not be based on ABI hashes.
        ABI hashes have a number of disadvantages as identifiers for
        packages in the database.  First, they cannot be computed until
        after compilation, which gave the multi-instance GSoC project from a
        few years ago some headaches.  Second, it's not really true that
        programs with identical ABI hashes are interchangeable: a new
        package may be ABI compatible but have different semantics.
        Thus, installed package IDs are a poor unique identifier for
        packages in the package database.  However, because GHC does not
        give ABI stability guarantees, under the new scheme it would not be
        possible to assume that packages with the same installed package
        ID are ABI compatible.

    \item Relaxing the uniqueness constraint on package IDs.  There are
        actually two things that could be done here.  First, since we
        have augmented package IDs with dependency resolution
        information to form package keys, we could simply state that
        package keys uniquely identify a package in a database.
        Shadowing rules can be implemented in the same way as before, by
        preferring the instance topmost on the stack.  Second, we could
        also allow \emph{same-database} shadowing: that is, not even
        package keys are guaranteed to be unique in a database: instead,
        installed package IDs are the sole unique identifier of a
        package.  This architecture is Nix inspired, as the intent is
        to keep all package information in a centralized database.
\end{itemize}

Without mandatory package environments, same-database shadowing is a bad
idea, because GHC now has no idea how to resolve shadowing.  Conflicting
installed package IDs can be simulated by placing them in multiple
package databases (in principle, the databases can be concatenated
together and treated as a single monolithic database.)

\section{Shapeless Backpack}\label{sec:simplifying-backpack}

Backpack as currently defined always requires a \emph{shaping} pass,
which calculates the shapes of all modules defined in a package.
The shaping pass is critical to the solution of the double-vision problem
in recursive module linking, but it also presents a number of unpalatable
implementation problems:

\begin{itemize}

    \item \emph{Shaping is a lot of work.} A module shape specifies the
        provenance of all data types and identifiers defined by a
        module. To calculate this, we must preprocess and parse all
        modules, even before we do the type-checking pass.  (Fortunately,
        shaping doesn't require a full parse of a module, only enough
        to get identifiers.  However, it does have to understand import
        statements at the same level of detail as GHC's renamer.)

    \item \emph{Shaping must be done upfront.} In the current Backpack
        design, all shapes must be computed before any typechecking can
        occur.  While performing the shaping pass upfront is necessary
        in order to solve the double vision problem (where a module
        identity may be influenced by later definitions), it means
        that GHC must first do a shaping pass, and then revisit every
        module and compile it properly.  Nor is it (easily) possible to skip the
        shaping pass when it is unnecessary, as one might expect to be
        the case in the absence of mutual recursion.  Shaping is not
        a ``pay as you go'' language feature.

    \item \emph{GHC can't compile all programs shaping accepts.}  Shaping
        accepts programs that GHC, with its current hs-boot mechanism, cannot
        compile.  In particular, GHC requires that any data type or function
        in a signature actually be \emph{defined} in the module corresponding
        to that file (i.e., an original name can be assigned to these entities
        immediately.)  Shaping permits unrestricted exports to implement
        modules; this shows up in the formalism as $\beta$ module variables.

    \item \emph{Shaping encourages inefficient program organization.}
        Shaping is designed to enable mutually recursive modules, but as
        currently implemented, mutual recursion is less efficient than
        code without recursive dependencies. Programmers should avoid
        this code organization, except when it is absolutely necessary.

    \item \emph{GHC is architecturally ill-suited for directly
        implementing shaping.}  Shaping implies that GHC's internal
        concept of an ``original name'' be extended to accommodate
        module variables.  This is an extremely invasive change to all
        aspects of GHC, since the original names assumption is baked
        quite deeply into the compiler.  Plausible implementations of
        shaping require all of these variables to be skolemized outside
        of GHC\@.

\end{itemize}

To be clear, the shaping pass is fundamentally necessary for some
Backpack packages.  Here is the example which convinced Simon:

\begin{verbatim}
package p where
    A :: [data T; f :: T -> T]
    B = [export T(MkT), h; import A(f); data T = MkT; h x = f MkT]
    A = [export T(MkT), f, h; import B; f MkT = MkT]
\end{verbatim}

The key to this example is that B \emph{may or may not typecheck} depending
on the definition of A. Because A reexports B's definition of T, B will
typecheck; but if A defined T on its own, B would not typecheck.  Thus,
we \emph{cannot} typecheck B until we have done some analysis of A (the
shaping analysis!)

Thus, it is beneficial (from an optimization point of view) to
consider a subset of Backpack for which shaping is not necessary.
Here is a programming discipline which does just that; we will call it the \textbf{linking restriction}: \emph{Module implementations must be declared before
signatures.}  Formally, this restriction modifies the rule for merging
polarized module shapes ($\widetilde{\tau}_1^{m_1} \oplus \widetilde{\tau}_2^{m_2}$) so that
$\widetilde{\tau}_1^- \oplus \widetilde{\tau}_2^+$ is always undefined.\footnote{This seemed to be the crispest way of defining the restriction, although this means an error happens a bit later than I'd like it to: I'd prefer if we errored while merging logical contexts, but we don't know what is a hole at that point.}

Here is an example of the linking restriction. Consider these two packages:

\begin{verbatim}
package random where
    System.Random = [ ... ].hs

package monte-carlo where
    System.Random :: ...
    System.MonteCarlo = [ ... ].hs
\end{verbatim}

Here, random is a definite package which may have been compiled ahead
of time; monte-carlo is an indefinite package with a dependency
on any package which provides \verb|System.Random|.

Now, to link these two packages together, only one ordering
is permissible:

\begin{verbatim}
package myapp where
    include random
    include monte-carlo
\end{verbatim}

If myapp wants to provide its own random implementation, it can do so:

\begin{verbatim}
package myapp2 where
    System.Random = [ ... ].hs
    include monte-carlo
\end{verbatim}

In both cases, all of \verb|monte-carlo|'s holes have been filled in by the time
it is included.  The alternate ordering is not allowed.

Why does this discipline prevent mutually recursive modules?  Intuitively,
a hole is the mechanism by which we can refer to an implementation
before it is defined; otherwise, we can only refer to definitions which
preceed our definition. If there are never any holes \emph{which get filled},
implementation links can only go backwards, ruling out circularity.

It's easy to see how mutual recursion can occur if we break this discipline:

\begin{verbatim}
package myapp2 where
    include monte-carlo
    System.Random = [ import System.MonteCarlo ].hs
\end{verbatim}

\subsection{Typechecking of definite modules without shaping}

If we are not carrying out a shaping pass, we need to be able to calculate
$\widetilde{\Xi}_{\mathsf{pkg}}$ on the fly.  In the case that we are
compiling a package---there will be no holes in the final package---we
can show that shaping is unnecessary quite easily, since with the
linking restriction, everything is definite from the get-go.

Observe the following invariant: at any given step of the module
bindings, the physical context $\widetilde{\Phi}$ contains no
holes.  We can thus conclude that there are no module variables in any
type shapes.  As the only time a previously calculated package shape can
change is due to unification, the incrementally computed shape is in
fact the true one.

As far as the implementation is concerned, we never have to worry
about handling module variables; we only need to do extra typechecks
against (renamed) interface files.

\subsection{Compiling definite packages}\label{sec:compiling}

% New definitions
\algnewcommand\algorithmicswitch{\textbf{switch}}
\algnewcommand\algorithmiccase{\textbf{case}}
\algnewcommand\algorithmicassert{\texttt{assert}}
% New "environments"
\algdef{SE}[SWITCH]{Switch}{EndSwitch}[1]{\algorithmicswitch\ #1\ \algorithmicdo}{\algorithmicend\ \algorithmicswitch}%
\algdef{SE}[CASE]{Case}{EndCase}[1]{\algorithmiccase\ ``#1''}{\algorithmicend\ \algorithmiccase}%
\algtext*{EndSwitch}%
\algtext*{EndCase}%

\begin{algorithm}
    \caption{Compilation of definite packages (assume \texttt{-hide-all-packages} on all \texttt{ghc} invocations)}\label{alg:compile}
\begin{algorithmic}
    \Procedure{Compile}{\textbf{package} $P$ \textbf{where} $\vec{B}$, $H$, $db$}\Comment{}$H$ maps hole module names to identities
    \State$flags\gets \nil$
    \State$\mathcal{K}$ $\gets$ \Call{Hash}{$P + H$}
    \State%
    In-place register the package $\mathcal{K}$ in $db$
    \For{$B$ \textbf{in} $\vec{B}$}
        \Case{$p = p\texttt{.hs}$}
        \State\Call{Exec}{\texttt{ghc -c} $p$\texttt{.hs} \texttt{-package-db} $db$ \texttt{-this-package-key} $\mathcal{K}$ $flags$}
        \EndCase%
        \Case{$p$ $\cc$ $p$\texttt{.hsig}}
            \State\Call{Exec}{\texttt{ghc -c} $p$\texttt{.hsig} \texttt{-package-db} $db$ \texttt{-sig-of} $H(p)$ $flags$}
        \EndCase%
        \Case{$p = p'$}
            \State$flags\gets flags$ \texttt{-alias} $p$ $p'$
        \EndCase%
        \Case{\Cinc{$P'$} $\langle\vec{p_H\mapsto p_H'}, \vec{p\mapsto p'} \rangle$}
            \State\textbf{let} $H'(p_H) = $ \Call{ResolveModule}{$p_H'$}
            \State$\mathcal{K}'\gets$ \Call{Compile}{$P'$, $H'$, $db$}\Comment{}Nota bene: not $flags$
            \State$flags\gets flags$ \texttt{-package} $\mathcal{K}'$ $\langle\vec{p\mapsto p'}\rangle$
        \EndCase%
    \EndFor%
    \State%
    Remove $\mathcal{K}$ from $db$
    \State%
    Install the complete package $\mathcal{K}$ to the global database
    \State\Return$\mathcal{K}$
    \EndProcedure%
\end{algorithmic}
\end{algorithm}

The full recursive procedure for compiling a Backpack package using
one-shot compilation is given in Algorithm~\ref{alg:compile}.  We
recursively walk through Backpack descriptions, processing each line by
invoking GHC and/or modifying our package state.  Here is a more
in-depth description of the algorithm, line-by-line:

\paragraph{The parameters} To compile a package description for package
$P$, we need to know $H$, the mapping of holes $p_H$ in package $P$ to
physical module identities $\nu$ which are implementing them; this
mapping is used to calculate the package key $\mathcal{K}$ for the
package in question.  Furthermore, we have an inplace package database
$db$ in which we will register intermediate build results, including
partially compiled parent packages which may provide implementations
of holes for packages they include.

\subsection{Compiling implementations}

We compile modules in the same way we do today, but with some extra
package visibility $flags$, which let GHC know how to resolve imports
and look up original names.  We'll describe what the new flags are
and also discuss some subtleties with module lookup.

\paragraph{In-place registration}  Perhaps surprisingly, we start
compilation by registering the (uncompiled) package in the in-place
package database.  This registration does not expose packages, and is
purely intended to inform the compilation of subpackages where to
find modules that are provided by the parent (in-progress) package,
as well as to provide auxiliary information, such as the package name
and version for error reporting.  The pre-registration trick is an old
one used by the GHC build system; the key invariant to look out for
is that we shouldn't reference original names in modules that haven't
been built yet.  This is enforced by our manual tracking of holes in
$H$: a module can't occur in $H$ unless it's already been compiled!
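
Concretely, the registration step might look something like the
following (a sketch: \texttt{pkg.conf} is a hypothetical metadata file
describing $\mathcal{K}$, with no modules exposed yet):

\begin{verbatim}
ghc-pkg register pkg.conf --package-db=db
\end{verbatim}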

\paragraph{New package resolution algorithm}  Currently, invocations
of \texttt{-package} and similar flags have the result of \emph{hiding}
other exposed packages with the same name.  However, this is not going
to work for Backpack: an indefinite package may get loaded multiple times
with different instantiations, and it might even make sense to load multiple
versions of the same package simultaneously, as long as their modules
are renamed to not conflict.

Thus, we impose the following behavior change: when
\texttt{-hide-all-packages} is specified, we do \emph{not} automatically
hide packages with the same name as a package specified by
\texttt{-package} (or a similar flag): they are all included, even if
there are conflicts.  To deal with conflicts, we augment the syntax of
\texttt{-package} to also accept a list of thinnings and renamings, e.g.
\texttt{-package} \pname{containers} $\langle\m{Data.Set},
\m{Data.Map}\mapsto \m{Map}\rangle$ says to make visible for import
\m{Data.Set} and \m{Map} (which is \m{Data.Map} renamed.)  This means
that
\texttt{-package} \pname{containers-0.9} $\langle\m{Data.Set}\mapsto
\m{Set09}\rangle$ \texttt{-package} \pname{containers-0.8}
$\langle\m{Data.Set}\mapsto \m{Set08}\rangle$ now uses both packages
concurrently (previously, GHC would hide one of them.)
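
To make this concrete, here is what a compilation using both versions
might look like (a sketch: the concrete command-line syntax for
thinnings and renamings is assumed, not finalized):

\begin{verbatim}
ghc -c D.hs -hide-all-packages \
    -package "containers-0.9 (Data.Set as Set09)" \
    -package "containers-0.8 (Data.Set as Set08)"
\end{verbatim}

Within \texttt{D.hs}, \texttt{import Set09} and \texttt{import Set08}
each resolve unambiguously to their respective versions.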

Additionally, it's important to note that two packages exporting the
same module do not \emph{necessarily} cause a conflict; the modules
may be linkable.  For example, \texttt{-package} \pname{containers} $\langle\m{Data.Set}\rangle$
\texttt{-package} \pname{containers} $\langle\m{Data.Set}\rangle$ is fine, because
precisely the same implementation of \m{Data.Set} is loaded in both cases.
A similar situation can occur with signatures:

\begin{verbatim}
package p where
    A :: [ x :: Int ]
package q where
    include p
    A :: [ y :: Int ]
    B = [ import A; z = x + y ] -- *
package r where
    A = [ x = 0; y = 0 ]
    include q
\end{verbatim}

Here, both \pname{p} and \pname{q} are visible when compiling the starred
module, which is compiled with the flag \texttt{-package} \pname{p}, but there
are two interface files for \m{A} available: one defined locally in \pname{q},
and one from \pname{p}.  Both of these interface files \emph{forward} to the
original implementation in \pname{r} (more on this in the ``Compiling
signatures'' section), so rather than
reporting an ambiguous import, we instead have to merge the two interface files
together. This is done by simulating multiple imports: one to each interface
file. This works because GHC does not consider symbols with equal original names
as conflicting.
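
To illustrate, the single \texttt{import A} in the starred module behaves
as if it had been elaborated into one import per visible interface file
(a sketch of the simulation, not concrete syntax):

\begin{verbatim}
-- Hypothetical elaboration of "import A" in the starred module:
import A  -- q's local signature interface: y :: Int (forwards to r:A)
import A  -- p's signature interface: x :: Int (forwards to r:A)
z = x + y -- both in scope; equal original names do not conflict
\end{verbatim}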

Note that we do not need to merge a signature with an implementation; in such
cases, we should just use the implementation's interface.  E.g.

\begin{verbatim}
package p where
    A :: ...
package q where
    A = ...
    include p
    B = [ import A ]    -- *
\end{verbatim}

Here, \m{A} is available both from \pname{p} and \pname{q}, but the use in the
starred module should be done with respect to the full implementation.

\paragraph{The \texttt{-alias} flag}  We introduce a new flag
\texttt{-alias} for aliasing modules.  Aliasing is analogous to
the merging that can occur when we include packages, but it also applies
to modules which are locally defined.  When we alias a module $p$ with
$p'$, we require that $p'$ exists in the current module mapping, and then
we attempt to add an entry for it under the name $p$.  If there is no mapping for
$p$, this succeeds; otherwise, we apply the same conflict resolution algorithm.
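
For example, a local alias binding would translate to this flag as
follows (a sketch; \texttt{KEY} stands for the computed package key of
the enclosing package):

\begin{verbatim}
-- Backpack binding:   A = B
-- Flags for modules compiled after the binding:
ghc -c M.hs -hide-all-packages -package-db db \
    -this-package-key KEY -alias A B
\end{verbatim}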

\subsection{Compiling signatures}

Signature compilation is triggered when we compile a signature file.
This mode is similar to how we process \verb|hs-boot| files, except
we pass an extra flag \verb|-sig-of| which specifies the
identity of the actual implementation of the signature (according to our $H$
mapping).  This implementation is guaranteed to exist, due to the linking
restriction, although it may live in a partially registered package
in $db$.  If the module is \emph{not} currently available under the same name as the
\texttt{hsig} file, we output an \texttt{hi} file which, for all declarations the
signature exposes, forwards their definitions to the original
implementation file.  The intent is that any code in the current package
which compiles against this signature will use this signature's \texttt{hi} file,
not the implementation's original \texttt{hi} file.
For example, the \texttt{hi} file produced when compiling the starred interface
points to the implementation in package \pname{q}.

\begin{verbatim}
package p where
    A :: ...    -- *
    B = [ import A; ... ]
package q where
    A = [ ... ]
    include p
\end{verbatim}
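
For this example, the invocation for compiling the starred signature
might look as follows (a sketch; the concrete format of the
\verb|-sig-of| argument is assumed):

\begin{verbatim}
ghc -c A.hsig -package-db db -this-package-key KEY_p -sig-of "q:A"
\end{verbatim}

where \texttt{q:A} is the implementing module recorded in $H(\m{A})$.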

\paragraph{Sometimes \texttt{hi} is unnecessary}
In the following package:

\begin{verbatim}
package p where
    P = ...
    P :: ...
\end{verbatim}

Paper Backpack specifies that we check the signature \m{P} against implementation
\m{P}, but otherwise no changes are made (i.e., the signature does not narrow
the implementation.) In this case, it is not necessary to generate an \texttt{hi} file;
the original interface file suffices.

\paragraph{Multiple signatures}  As a simplification, we assume that there
is only one signature per logical name in a package.  (This prevents
us from expressing mutual recursion in signatures, but let's not worry
about it for now.)

\paragraph{Restricted recursive modules \`a la hs-boot}\label{sec:hs-boot-restrict}
When we compile an \texttt{hsig} file without any \texttt{-sig-of} flag (because
no implementation is known), we fall back to old-style GHC mutual recursion.
Na\"\i vely, a shaping pass would be necessary;
so we adopt an existing constraint that
already applies to hs-boot files: \emph{at the time we define a signature,
we must know what the original name for all data types is}.  In practice,
GHC enforces this by stating that: (1) an hs-boot file must be
accompanied with an implementation, and (2) the implementation must
in fact define (and not reexport) all of the declarations in the signature.
We can detect that a signature is intended to break a recursive module loop
when $p\notin \mathrm{dom}(H)$; in this case, we fall back to the
old hs-boot behavior. (Alternatively, the user can explicitly ask for it.)
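
For reference, here is a minimal example of the discipline these two
conditions describe: the boot file declares its entities abstractly, and
the accompanying implementation defines each of them itself:

\begin{verbatim}
-- A.hs-boot: every declaration gets an original name in A immediately
module A where
data T
f :: T -> T

-- A.hs: defines (and does not reexport) T and f
module A where
data T = MkT
f :: T -> T
f x = x
\end{verbatim}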

Why does this not require a shaping pass? The reason is that the
signature is not really polymorphic: we require that the $\alpha$ module
variable be resolved to a concrete module later in the same package, and
that all the $\beta$ module variables be unified with $\alpha$. Thus, we
know ahead of time the original names and don't need to deal with any
renaming.\footnote{This strategy doesn't completely resolve the problem
of cross-package mutual recursion, because we need to first compile a
bit of the first package (signatures), then the second package, and then
the rest of the first package.}

Compiling packages in this way gives the tantalizing possibility
of true separate compilation: the only thing we don't know is what the actual
package name of an indefinite package will be, and what the correct references
to have are.  This is a very minor change to the assembly, so one could conceive
of dynamically rewriting these references at the linking stage.  But
separate compilation achieved in this fashion would not be able to take
advantage of cross-module optimizations.

\subsection{Compiling includes}

Includes are the most interesting part of the compilation process, as we have
to calculate how the holes of the subpackage we are filling in are compiled
($H'$) and modify our flags to make the exports of the include visible to
subsequently compiled modules.  We consider the case with renaming, since
includes with no renaming are straightforward.

First, we assume that we know \emph{a priori} what the holes of a
package $p_H$ are (either by some sort of pre-pass, or explicit
declaration.)  For each of their \emph{renamed targets} $p'_H$, we determine
what the original module associated with $p'_H$ is, based on
the package database that we have been manipulating.
For example:

\begin{verbatim}
package p where
    A :: ...
    ...
package q where
    A = [ ... ]
    B = [ ... ]
    include p (A as B)
\end{verbatim}

When computing the entry $H'(\m{A})$, we determine what the original
module for \m{B} is.

Next, we recursively call \textsc{Compile} with the computed $H'$.
Note that the entries in $H$ may refer to modules which would not be
picked up by $flags$, but they will be registered in the inplace
package database $db$.
For example, in this situation:

\begin{verbatim}
package p where
    B :: ...
    C = [ import B; ... ]
package q where
    A = [ ... ]
    B = [ import A; ... ]
    include p
    D = [ import C; ... ]
\end{verbatim}

When we recursively process package \pname{p}, $H$ will refer to
\pname{q}:\m{B}, and we need to know where to find it (\pname{q} is only
partially processed and so is in the inplace package database.)
Furthermore, the interface file for \m{B} may refer to \pname{q}:\m{A},
and thus we likewise need to know how to find its interface file.

Note that the inplace package database is not expected to expose intermediate
packages. Otherwise, this example would improperly compile:

\begin{verbatim}
package p where
    B = [ import A; ... ]
package q where
    A = ...
    include p
\end{verbatim}

\pname{p} does not compile on its own, so it should not compile if it is
recursively invoked from \pname{q}.  However, if we exposed the modules
of the partially registered package \pname{q}, \m{A} is now suddenly
resolvable.

Finally, once the subpackage is compiled, we can add it to our $flags$, so that
later modules we compile see its (appropriately thinned and renamed) modules;
conflicts are resolved just as with aliasing.

\paragraph{Absence of an \texttt{hi} file}
It is important that when we resolve a module, we look up the \emph{implementor}
of a module, and not just a signature which is providing it at some name.
Sometimes, it can be a bit indirect, for example:

\begin{verbatim}
package p where
    A :: [ y :: Int ]
package q where
    A :: [ x :: Int ]
    include p -- *
package r where
    A = [ x = 0; y = 1 ]
    include q
\end{verbatim}

When computing $H'$ for the starred include, our $flags$ only include
\texttt{-package-dir} \pname{r} $cwd_r$ $\langle\rangle$: with a thinning
that excludes all modules!  The only interface file we can pick up with these
$flags$ is the local definition of \m{A}.  However, we absolutely should
\emph{not} set $H'(\m{A})=\pname{q}:\m{A}$; if we do, then we will incorrectly
conclude when compiling the signature in \pname{p} that the implementation
doesn't export enough identifiers to fulfill the signature (\texttt{y} is not
available from just the signature in \pname{q}).  Instead, we have to look
up the original implementor of \m{A} in \pname{r}, and use that in $H'$.
If you maintain the invariant that you always know what the original implementor
is of all modules in scope, it's easy enough to figure this out.

\subsection{Commentary}

\paragraph{Just because it compiled, doesn't mean the individual packages type check}
The compilation mechanism described is slightly more permissive than vanilla Backpack.
Here is a simple example:

\begin{verbatim}
package p where
    A :: [ data T = T ]
    B :: [ data T = T ]
    C = [
        import A
        import B
        x = A.T :: B.T
    ]
package q where
    A = [ data T = T ]
    B = A
    include p
\end{verbatim}

Here, we incorrectly decide \m{A}\verb|.T| and \m{B}\verb|.T| are type
equal when typechecking \m{C}, because the \verb|hisig| files we
generate for them all point to the same original implementation.  However,
\pname{p} should not typecheck.

The problem here is that type checking asks ``does it compile with respect
to all possible instantiations of the holes'', whereas compilation asks
``does it compile with respect to this particular instantiation of holes.''
In the absence of a shaping pass, this problem is unavoidable.

\section{Shaped Backpack}

Despite the simplicity of shapeless Backpack with the linking
restriction in the absence of holes, we will find that when we have
holes, it will be very difficult to do type-checking without
some form of shaping.  This section is very much a work in progress,
but the ability to typecheck against holes, even with the linking restriction,
is a very important part of modular separate development, so we will need
to support it at some point.

\subsection{Efficient shaping}

(This is Edward's opinion; he hasn't convinced other folks that this is
the right way to do it.)

In this section, I want to argue that, although shaping constitutes
a pre-pass which must be run before compilation in earnest, it is only
about as bad as the dependency resolution analysis that GHC already does
in \verb|ghc -M| or \verb|ghc --make|.

In Paper Backpack, what information does shaping compute? It looks at
exports, imports, data declarations and value declarations (but not the
actual expressions associated with these values.)  As a matter of fact,
GHC already must look at the imports associated with a package in order
to determine the dependency graph, so that it can have some order to compile
modules in.  There is a specialized parser which just parses these statements,
and then ignores the rest of the file.

A bit of background: the \emph{renamer} is responsible for resolving
imports and figuring out where all of these entities actually come from.
SPJ would really like to avoid having to run the renamer in order to perform
a shaping pass.

\paragraph{Is it necessary to run the Renamer to do shaping?}
Edward and Scott believe the answer is, at least partially, no.
Shaping needs to know the original names of all entities exposed by a
module/signature. Then it needs to know (a) which entities a module/signature
defines/declares locally and (b) which entities that module/signature exports.
The former, (a), can be determined by a straightforward inspection of a parse
tree of the source file.\footnote{Note that no expression or type parsing
is necessary. We only need names of local values, data types, and data
constructors.} The latter, (b), is a bit trickier. Right now it's the Renamer
that interprets imports and exports into original names, so we would still
rely on that implementation. However, the Renamer does other, harder things
that we don't need, so ideally we could factor out the import/export
resolution from the Renamer for use in shaping.

Unfortunately the Renamer's import resolution analyzes \verb|.hi| files, but for
local modules, which haven't yet been typechecked, we don't have those.
Instead, we could use a new file format, \verb|.hsi| files, to store the shape of
a locally defined module. (Defined packages are bundled with their shapes,
so included modules have \verb|.hsi| files as well.) (What about the logical
vs.~physical distinction in file names?) If we refactor the import/export
resolution code, could we rewrite it to generically operate on both
\verb|.hi| files and \verb|.hsi| files?

Alternatively, rather than storing shapes on a per-source basis, we could
store (in memory) the entire package shape. Similarly, included packages
could have a single shape file for the entire package. Although this approach
would make shaping non-incremental, since an entire package's shape would
be recomputed any time a constituent module's shape changes, we do not expect
shaping to be all that expensive.
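
To give a flavor of what would be recorded, here is a purely
hypothetical sketch of a \verb|.hsi| file for the module
\verb|B = [ import A(f); data T = MkT; h x = f MkT ]|:

\begin{verbatim}
-- B.hsi (hypothetical format)
defines: T, MkT, h     -- (a) locally declared entities
exports: T, MkT, h     -- (b) exported entities, with original names
\end{verbatim}

No expression or type bodies are stored; only enough to compute shapes.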

\subsection{Typechecking of indefinite modules}\label{sec:typechecking-indefinite}

Recall our argument in the definite case, where we showed that there are
no holes in the physical context.  With indefinite modules, this is no
longer true. While (with the linking restriction) these holes will never
be linked against a physical implementation, they may be linked against
other signatures.  (Note: while disallowing signature linking would
solve our problem, it would disallow a wide array of useful instances of
signature reuse, for example, a package mylib that implements both
mylib-1x-sig and mylib-2x-sig.)

With holes, we must handle module variables, and we sometimes must unify them:

\begin{verbatim}
package p where
    A :: [ data A ]
package q where
    A :: [ data A ]
package r where
    include p
    include q
\end{verbatim}

In this package, it is not possible to assign original names a priori to
module A in p and q, because in package r, they should have the same
original name.  When signature linking occurs, unification may occur,
which means we have to rename all relevant original names. (A similar
situation occurs when a module is typechecked against a signature.)

An invariant which would be nice to have is this: when typechecking a
signature or including a package, we may apply renaming to the entities
being brought into context.  But once we've picked an original name for
our entities, no further renaming should be necessary. (Formally, in the
unification for semantic object shapes, apply the unifier to the second
shape, but not the first one.)

However, there are plenty of counterexamples here:

\begin{verbatim}
package p where
    A :: [ data A ]
    B :: [ data A ]
    M = ...
    A = B
\end{verbatim}

In this package, does module M know that A.A and B.A are type equal?  In
fact, the shaping pass will have assigned equal module identities to A
and B, so M \emph{equates these types}, despite the aliasing occurring
after the fact.

We can make this example more sophisticated, by having a later
subpackage which causes the aliasing; now, the decision is not even a
local one (on the other hand, the equality should be evident by inspection
of the package interface associated with q):

\begin{verbatim}
package p where
    A :: [ data A ]
    B :: [ data A ]
package q where
    A :: [ data A ]
    B = A
package r where
    include p
    include q
\end{verbatim}

Another possibility is that it might be acceptable to do a mini-shaping
pass, without parsing modules or signatures, \emph{simply} looking at
names and aliases.  But logical names are not the only mechanism by
which unification may occur:

\begin{verbatim}
package p where
    C :: [ data A ]
    A = [ data A = A ]
    B :: [ import A(A) ]
    C = B
\end{verbatim}

It is easy to conclude that the original names of C and B are the same.  But
more importantly, C.A must be given the original name of p:A.A.  This can only
be discovered by looking at the signature definition for B. In any case, it
is worth noting that this situation parallels the situation with hs-boot
files (although there is no mutual recursion here).

The conclusion is that you will probably, in fact, have to do real
shaping in order to typecheck all of these examples.

\paragraph{Hey, these signature imports are kind of tricky\ldots}

When signatures and modules are interleaved, the interaction can be
complex.  Here is an example:

\begin{verbatim}
package p where
    C :: [ data A ]
    M = [ import C; ... ]
    A = [ import M; data A = A ]
    C :: [ import A(A) ]
\end{verbatim}

Here, the second signature for C refers to a module implementation A
(this is permissible: it simply means that the original name for p:C.A
is p:A.A).  But wait! A relies on M, and M relies on C. Circularity?
Fortunately not: a client of package p will find it impossible to have
the hole C implemented in advance, since they will need to get their hands on module
A\ldots but it will not be defined prior to package p.

In any case, however, it would be good to emit a warning if a package
cannot be compiled without mutual recursion.

\subsection{Rename on entry}

Consider the following example:

\begin{verbatim}
package p where
    A :: [ data T = T ]
    B = [ import A; x = T ]
package q where
    C :: ...
    A = [ data T = T ]
    include p
    D = [
        import qualified A
        import qualified B
        import C
        x = B.T :: A.T
    ]
\end{verbatim}

We are interested in type-checking \pname{q}, which is an indefinite package
on account of the uninstantiated hole \m{C}.  Furthermore, let's suppose that
\pname{p} has already been independently typechecked, and its interface files
installed in some global location with $\alpha_A$ used as the module identity
of \m{A}. (To simplify this example, we'll assume $\beta_{AT}=\alpha_A$.)

The first three lines of \pname{q} type check in the normal way, but \m{D}
now poses a problem: if we load the interface file for \m{B} the normal way,
we will get a reference to type \texttt{T} with the original name $\alpha_A$.\texttt{T},
whereas from \m{A} we have an original name \pname{q}:\m{A}.\texttt{T}.

Let's suppose that we already have the result of a shaping pass, which
maps our identity variables to their true identities.
Let's consider the possible options here:

\begin{itemize}
    \item We could re-typecheck \pname{p}, feeding it the correct instantiations
        for its variables.  However, this seems wasteful: we typechecked the
        package already, and up-to-renaming, the interface files are exactly
        what we need to type check our application.
    \item We could make copies of all the interface files, renamed to have the
        right original names.  This also seems wasteful: why should we have to
        create a new copy of every interface file in a library we depend on?
    \item When \emph{reading in} the interface file to GHC, we could apply the
        renaming according to the shaping pass and store that in memory.
\end{itemize}

That last solution is pretty appealing.  However, there are still circumstances
in which we need to create new interface files; these exactly mirror the cases described
in Section~\ref{sec:compiling}.
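
A minimal sketch of what rename-on-entry might look like inside GHC
(apart from \verb|ModIface|, every name here is hypothetical):

\begin{verbatim}
-- Substitution computed by the shaping pass, mapping identity
-- variables to their true module identities.
type ShapeSubst = Map ModuleVar Module

-- Apply the substitution to every original name as the interface is
-- read in; the renamed interface lives only in memory.
renameIfaceOnEntry :: ShapeSubst -> ModIface -> ModIface
renameIfaceOnEntry subst = mapIfaceOriginalNames (substModule subst)
\end{verbatim}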

\subsection{Incremental typechecking}
We want to typecheck modules incrementally, i.e., when something changes in
a package, we only want to re-typecheck the modules that care about that
change. GHC already does this today.%
\footnote{\url{https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/RecompilationAvoidance}}
Is the same mechanism sufficient for Backpack? Edward and Scott think that it
is, mostly. Our conjecture is that a module should be re-typechecked if the
existing mechanism says it should \emph{or} if the logical shape
context (which maps logical names to physical names) has changed. The latter
condition is due to aliases that affect typechecking of modules.

Let's look again at an example from before:
\begin{verbatim}
package p where
    A :: [ data A ]
    B :: [ data A ]
    M = [ import A; import B; ... ]
\end{verbatim}
Let's say that \verb|M| is typechecked successfully. Now we add an alias binding
at the end of the package, \verb|A = B|. Does \verb|M| need to be
re-typechecked? Yes! (Well, it seems so, but let's just assert ``yes'' for now.
Certainly in the reverse case---if we remove the alias and then ask---this
is true, since \verb|M| might have depended on the two \verb|A| types
being the same.)
The logical shape context changed to say that \verb|A| and
\verb|B| now map to the same physical module identity. But does the existing
recompilation avoidance mechanism say that \verb|M| should be re-typechecked?
It's unclear. The \verb|.hi| file for \verb|M| records that it imported \verb|A| and
\verb|B| with particular ABIs, but does it also know about the physical module
identities (or rather, original module names) of these modules?

Scott thinks this highlights the need for us to get our story straight about
the connection between logical names, physical module identities, and file
names!


\subsection{Installing indefinite packages}\label{sec:installing-indefinite}

If an indefinite package contains no code at all, we only need
to install the interface files for its signatures.  However, if
it includes code, we must provide all of the
ingredients necessary to compile it when the holes are linked against
actual implementations.  (Figure~\ref{fig:pkgdb})

\paragraph{Source tarball or preprocessed source?}  What representation of the source should be saved?  There
are a number of possible choices:

\begin{itemize}
    \item The original tarballs downloaded from Hackage,
    \item Preprocessed source files,
    \item Some sort of internal, type-checked representation of Haskell code (maybe the output of the desugarer).
\end{itemize}

Storing the tarballs is the simplest and most straightforward mechanism,
but we will have to be very certain that we can recompile the module
later in precisely the same way we compiled it originally, to ensure the \texttt{hi}
files match up (fortunately, it should be simple to perform an optional
sanity check before proceeding.) The appeal of saving preprocessed
source, or even the IRs, is that conceptually, this is exactly
what an indefinite package is: we have paused the compilation process
partway, intending to finish it later.  However, our compilation strategy
for definite packages requires us to run this step using a \emph{different}
choice of original names, so it's unclear how much work could actually be reused.

\paragraph{Sources in sandboxes}  Another nice way to implement indefinite
packages is to register them as source packages in a Cabal sandbox, and then
teach Cabal how to build them multiple times in the compile process.  Perhaps
the global package database should be extended with a directory of source
packages in order to support indefinite packages.

\section{Surface syntax}

In the Backpack paper, a brand new module language is presented, with
syntax for inline modules and signatures.  This syntax is probably worth implementing,
because it makes it easy to specify compatibility packages, whose module
definitions in general may be very short:

\begin{verbatim}
package ishake-0.12-shake-0.13 where
    include shake-0.13
    Development.Shake.Sys = Development.Shake.Cmd
    Development.Shake = [ (**>) = (&>) ; (*>>) = (|*>)]
    Development.Shake.Rule = [ defaultPriority = rule . priority 0.5 ]
    include ishake-0.12
\end{verbatim}

However, there are a few things that are less than ideal about the
surface syntax proposed by Paper Backpack:

\begin{itemize}
    \item It's completely different from the current method by which users
        specify packages. There's nothing wrong with this per se
        (one simply needs to support both formats) but the smaller
        the delta, the easier the new packaging format is to explain
        and implement.

    \item Sometimes order matters (relative ordering of signatures and
        module implementations), and other times it does not (aliases).
        This can be confusing for users.

    \item Users have to order module definitions topologically,
        whereas in current Cabal modules can be listed in any order, and
        GHC figures out an appropriate order to compile them.
\end{itemize}

Here is an alternative proposal, closely based on Cabal syntax.  Given
the following Backpack definition:

\begin{verbatim}
package libfoo(A, B, C, Foo) where
    include base
    -- renaming and thinning
    include libfoo (Foo, Bar as Baz)
    -- holes
    A :: [ a :: Bool ].hsig
    A2 :: [ b :: Bool ].hsig
    -- normal module
    B = [
        import {-# SOURCE #-} A
        import Foo
        import Baz
        ...
    ].hs
    -- recursively linked pair of modules, one is private
    C :: [ data C ].hsig
    D = [ import {-# SOURCE #-} C; data D = D C ].hs
    C = [ import D; data C = C D ].hs
    -- alias
    A = A2
\end{verbatim}

We can write the following Cabal-like syntax instead (where
all of the signatures and modules are placed in appropriately
named files):

\begin{verbatim}
package: libfoo
...
build-depends: base, libfoo (Foo, Bar as Baz)
required-signatures: A A2 -- deferred for now
exposed-modules: Foo B C
aliases: A = A2
other-modules: D
\end{verbatim}

Notably, all of these lists are \emph{insensitive} to ordering!
The key idea is the use of the \verb|{-# SOURCE #-}| pragma, which
is enough to solve the important ordering constraint between
signatures and modules.

Here is how the elaboration works.  For simplicity, in this algorithm
description, we assume all packages being compiled have no holes
(including \verb|build-depends| packages). Later, we'll discuss how to
extend the algorithm to handle holes in both subpackages and the main
package itself.

\begin{enumerate}

    \item At the top-level with \verb|package| $p$ and
        \verb|exposed-modules| $ms$, record \verb|package p (ms) where|

    \item For each package $p$ with thinning/renaming $ms$ in
        \verb|build-depends|, record an \verb|include p (ms)| in the
        Backpack package.  The ordering of these includes does not
        matter, since none of these packages have holes.

    \item Take all modules $m$ in \verb|other-modules| and
        \verb|exposed-modules| which were not exported by build
        dependencies, and create a directed graph where hs and hs-boot
        files are nodes and imports are edges (the target of an edge is
        an hs file if it is a normal import, and an hs-boot file if it
        is a SOURCE import).  Topologically sort this graph, erroring if
        this graph contains cycles (even with recursive modules, the
        cycle should have been broken by an hs-boot file).  For each
        node, in this order, record \verb|M = [ ... ]| or \verb|M :: [ ... ]|
        depending on whether or not it is an hs or hs-boot.  If possible,
        sort signatures before implementations when there is no constraint
        otherwise.

\end{enumerate}

Here is a simple example which shows how SOURCE can be used to disambiguate
between two important cases. Suppose we have these modules:

\begin{verbatim}
-- A1.hs
import {-# SOURCE #-} B

-- A2.hs
import B

-- B.hs
x = True

-- B.hs-boot
x :: Bool
\end{verbatim}

Then we translate the following packages as follows:

\begin{verbatim}
exposed-modules: A1 B
-- translates to
B :: [ x :: Bool ]
A1 = [ import B ]
B = [ x = True ]
\end{verbatim}

but

\begin{verbatim}
exposed-modules: A2 B
-- translates to
B = [ x = True ]
B :: [ x :: Bool ]
A2 = [ import B ]
\end{verbatim}

The import controls placement between signature and module: in A1 it
forces B's signature to be sorted before B's implementation (whereas in
the second case, there is no constraint, so we preferentially place
B's implementation first.)

\paragraph{Holes in the database} In the presence of holes,
\verb|build-depends| resolution becomes more complicated.  First,
let's consider the case where the package we are building is
definite, but the package database contains indefinite packages with holes.
In order to maintain the linking restriction, we now have to order packages
from step (2) of the previous elaboration.  We can do this by creating
a directed graph, where nodes are packages and edges are from holes to the
package which implements them.  If there is a cycle, this indicates a mutually
recursive package.  In the absence of cycles, a topological sorting of this
graph preserves the linking invariant.
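
Revisiting the earlier example: with \verb|build-depends: monte-carlo, random|,
the hole \m{System.Random} in \pname{monte-carlo} is implemented by
\pname{random}, so the hole-to-implementor edge forces this ordering of
the recorded includes:

\begin{verbatim}
include random       -- provides System.Random
include monte-carlo  -- its hole is filled by the include above
\end{verbatim}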

One subtlety to consider is the fact that an entry in \verb|build-depends|
can affect how a hole is instantiated by another entry.  This might be a
bit weird to users, who might like to explicitly say how holes are
filled when instantiating a package.  Food for thought, surface syntax wise.

\paragraph{Holes in the package} Actually, this is quite simple: the
ordering of includes goes as before, but some indefinite packages in the
database are less constrained, as their ``dependencies'' are fulfilled
by the holes at the top level of this package.  It's also worth noting
that some dependencies will go unresolved, since the following package
is valid:

\begin{verbatim}
package a where
    A :: ...
package b where
    include a
\end{verbatim}

\paragraph{Multiple signatures}  In Backpack syntax, it's possible to
define a signature multiple times, which is necessary for mutually
recursive signatures:

\begin{verbatim}
package a where
    A :: [ data A ]
    B :: [ import A; data B = B A ]
    A :: [ import B; data A = A B ]
\end{verbatim}

Critically, notice that we can see the constructors for both module B and A
after the signatures are linked together.  This is not possible in GHC
today, but could be possible by permitting multiple hs-boot files.  Now
the SOURCE pragma indicating an import must \emph{disambiguate} which
hs-boot file it intends to include.  This might be one way of doing it:

\begin{verbatim}
-- A.hs-boot2
data A

-- B.hs-boot
import {-# SOURCE hs-boot2 #-} A

-- A.hs-boot
import {-# SOURCE hs-boot #-} B
\end{verbatim}

\paragraph{Explicit or implicit reexports}  One annoying property of
this proposal is that, looking at the \verb|exposed-modules| list, it is
not immediately clear what source files one would expect to find in the
current package.  It's not obvious what the proper way to go about doing
this is.

\paragraph{Better syntax for SOURCE}  If we enshrine the SOURCE import
as a way of solving Backpack ordering problems, it would be nice to have
some better syntax for it. One possibility is:

\begin{verbatim}
abstract import Data.Foo
\end{verbatim}

which makes it clear that this module is pluggable, typechecking against
a signature.  Note that this only indicates how type checking should be
done: when actually compiling the module we will compile against the
interface file for the true implementation of the module.

It's worth noting that the SOURCE annotation was originally made a
pragma because, in principle, it should have been possible to compile
some recursive modules without needing the hs-boot file at all. But if
we're moving towards boot files as signatures, this concern is less
relevant.

\section{Type classes and type families}

\subsection{Background}

Before we talk about how to support type classes in Backpack, it's first
worth talking about what we are trying to achieve in the design.  Most
would agree that \emph{type safety} is the cardinal law that should be
preserved (in the sense that segfaults should not be possible), but
there are many instances of ``bad behavior'' (top level mutable state,
weakening of abstraction guarantees, ambiguous instance resolution, etc.)
which various Haskellers may disagree on the necessity of ruling out.

With this in mind, it is worth summarizing what kind of guarantees are
presently given by GHC with regards to type classes and type families,
as well as characterizing the \emph{cultural} expectations of the
Haskell community.

\paragraph{Type classes}  When discussing type class systems, there are
several properties that one may talk about.
A set of instances is \emph{confluent} if, no matter what order
constraint solving is performed, GHC will terminate with a canonical set
of constraints that must be satisfied for any given use of a type class.
In other words, confluence says that we won't conclude that a program
doesn't type check just because we swapped in a different constraint
solving algorithm.

Confluence's closely related twin is \emph{coherence} (defined in ``Type
classes: exploring the design space''). This property states that
``every different valid typing derivation of a program leads to a
resulting program that has the same dynamic semantics.''  Why could
differing typing derivations result in different dynamic semantics?  The
answer is that context reduction, which picks out type class instances,
elaborates into concrete choices of dictionaries in the generated code.
Confluence is a prerequisite for coherence, since one
can hardly talk about the dynamic semantics of a program that doesn't
type check.
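
As a small illustration of how different derivations can change dynamic
semantics, consider this sketch (it needs \verb|FlexibleInstances|,
\verb|OverlappingInstances|, and \verb|IncoherentInstances| to compile):

\begin{verbatim}
class C a where c :: a -> String
instance C a   where c _ = "general"
instance C Int where c _ = "Int"

f :: a -> String
f = c              -- constraint solved here: the general instance

g = f (1 :: Int)   -- "general", even though c (1 :: Int) is "Int"
\end{verbatim}

Which dictionary \verb|f| closes over depends on where the wanted
constraint is discharged, so coherence fails.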

In the vernacular, confluence and coherence are often incorrectly used
to refer to another related property: \emph{global uniqueness of instances},
which states that in a fully compiled program, for any type, there is at most one
instance resolution for a given type class.  Languages with local type
class instances such as Scala generally do not have this property, and
this assumption is frequently used for abstraction.

So, what properties does GHC enforce, in practice?
In the absence of any type system extensions, GHC employs a set of
rules (described in ``Exploring the design space'') to ensure that type
class resolution is confluent and coherent.  Intuitively, it achieves
this by having a very simple constraint solving algorithm (generate
wanted constraints and solve wanted constraints) and then requiring the
set of instances to be \emph{nonoverlapping}, ensuring there is only
ever one way to solve a wanted constraint.  Nonoverlap is a
more stringent restriction than either confluence or coherence, and
via the \verb|OverlappingInstances| and \verb|IncoherentInstances| extensions, GHC
allows a user to relax this restriction ``if they know what they're doing.''

Surprisingly, however, GHC does \emph{not} enforce global uniqueness of
instances.  Imported instances are not checked for overlap until we
attempt to use them for instance resolution.  Consider the following program:

\begin{verbatim}
-- T.hs
data T = T
-- A.hs
import T
instance Eq T where
-- B.hs
import T
instance Eq T where
-- C.hs
import A
import B
\end{verbatim}

When compiled with one-shot compilation, \verb|C| will not report
overlapping instances unless we actually attempt to use the \verb|Eq|
instance in C.\footnote{When using batch compilation, GHC reuses the
    instance database and is actually able to detect the duplicated
    instance when compiling B.  But if you run it again, recompilation
avoidance skips A, and it finishes compiling!  See this bug:
\url{https://ghc.haskell.org/trac/ghc/ticket/5316}}  This is by
design\footnote{\url{https://ghc.haskell.org/trac/ghc/ticket/2356}}:
eagerly ensuring that there are no overlapping instances would require
reading all the interface files a module may depend on.

We might summarize these three properties in the following manner.
Culturally, the Haskell community expects \emph{global uniqueness of instances}
to hold: the implicit global database of instances should be
confluent and coherent.  GHC, however, does not enforce uniqueness of
instances: instead, it merely guarantees that the \emph{subset} of the
instance database it uses when it compiles any given module is confluent and coherent.  GHC does do some
tests when an instance is declared to see if it would result in overlap
with visible instances, but the check is by no means
perfect\footnote{\url{https://ghc.haskell.org/trac/ghc/ticket/9288}};
truly, \emph{type-class constraint resolution} has the final word.  One
mitigating factor is that in the absence of \emph{orphan instances}, GHC is
guaranteed to eagerly notice when the instance database has overlap.\footnote{Assuming that the instance declaration checks actually worked\ldots}

Clearly, the fact that GHC's lazy behavior is surprising to most
Haskellers means that the lazy check is mostly good enough: a user
is likely to discover overlapping instances one way or another.
However, it is relatively simple to construct example programs which
violate global uniqueness of instances in an observable way:

\begin{verbatim}
-- A.hs
module A where
data U = X | Y deriving (Eq, Show)

-- B.hs
module B where
import Data.Set
import A

instance Ord U where
    compare X X = EQ
    compare X Y = LT
    compare Y X = GT
    compare Y Y = EQ

ins :: U -> Set U -> Set U
ins = insert

-- C.hs
module C where
import Data.Set
import A

instance Ord U where
    compare X X = EQ
    compare X Y = GT
    compare Y X = LT
    compare Y Y = EQ

ins' :: U -> Set U -> Set U
ins' = insert

-- D.hs
module Main where
import Data.Set
import A
import B
import C

test :: Set U
test = ins' X $ ins X $ ins Y $ empty

main :: IO ()
main = print test

-- OUTPUT
$ ghc -Wall -XSafe -fforce-recomp --make D.hs
[1 of 4] Compiling A ( A.hs, A.o )
[2 of 4] Compiling B ( B.hs, B.o )

B.hs:5:10: Warning: Orphan instance: instance [safe] Ord U
[3 of 4] Compiling C ( C.hs, C.o )

C.hs:5:10: Warning: Orphan instance: instance [safe] Ord U
[4 of 4] Compiling Main ( D.hs, D.o )
Linking D ...
$ ./D
fromList [X,Y,X]
\end{verbatim}

Locally, all type class resolution was coherent: in the subset of
instances each module had visible, type class resolution could be done
unambiguously.  Furthermore, the types of \verb|ins| and \verb|ins'|
discharge type class resolution, so that in \verb|D| when the database
is now overlapping, no resolution occurs, so the error is never found.

It is easy to dismiss this example as an implementation wart in GHC, and
continue pretending that global uniqueness of instances holds.  However,
the problem with \emph{global uniqueness of instances} is that it is
inherently nonmodular: you might find yourself unable to compose two
components because they accidentally defined the same type class
instance, even though these instances are plumbed deep in the
implementation details of the components.

As it turns out, there is already another feature in Haskell which
must enforce global uniqueness, to prevent segfaults.
We now turn to type classes' close cousin: type families.

\paragraph{Type families}  With type families, confluence is the primary
property of interest.  (Coherence is not of much interest because type
families are elaborated into coercions, which don't have any
computational content.)  Rather than considering the set of
constraints we reduce to, confluence for type families considers the
reduction of type families.  The overlap checks for type families
can be quite sophisticated, especially in the case of closed type
families.

Unlike type classes, however, GHC \emph{does} check the non-overlap
of type families eagerly.  The analogous program does \emph{not} type check:

\begin{verbatim}
-- F.hs
type family F a :: *
-- A.hs
import F
type instance F Bool = Int
-- B.hs
import F
type instance F Bool = Bool
-- C.hs
import A
import B
\end{verbatim}

The reason is that it is \emph{unsound} to ever allow any overlap
(unlike in the case of type classes where it just leads to incoherence.)
Thus, whereas one might imagine dropping the global uniqueness of instances
invariant for type classes, it is absolutely necessary to perform global
enforcement here.  There's no way around it!
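
To see why, observe what we could write if both instances above were
admitted (the standard unsoundness sketch):

\begin{verbatim}
-- In A.hs (where F Bool = Int is visible):
fromF :: F Bool -> Int
fromF x = x

-- In B.hs (where F Bool = Bool is visible):
toF :: Bool -> F Bool
toF x = x

-- In C.hs (importing both):
coerce :: Bool -> Int
coerce = fromF . toF   -- a Bool reinterpreted as an Int
\end{verbatim}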

\subsection{Local type classes}

Here, we say \textbf{NO} to global uniqueness.

This design is perhaps best discussed in relation to modular type
classes, which shares many similar properties.  Instances are now
treated as first class objects (in MTCs, they are simply modules)---we
may explicitly hide or include instances for type class resolution (in
MTCs, this is done via the \verb|using| top-level declaration).  This is
essentially what was sketched in Section 5 of the original Backpack
paper.  As a simple example:

\begin{verbatim}
package p where
    A :: [ data T = T ]
    B = [ import A; instance Eq T where ... ]

package q where
    A = [ data T = T; instance Eq T where ... ]
    include p
\end{verbatim}

Here, \verb|B| does not see the extra instance declared by \verb|A|,
because it was thinned from its signature of \verb|A| (and thus never
declared canonical.)  To declare an instance without making it
canonical, it must be placed in a separate (unimported) module.
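
For example, an instance can be declared without being made canonical by
placing it in a module which nothing imports (a sketch):

\begin{verbatim}
package p where
    A = [ data T = T ]
    AInstances = [ import A; instance Eq T where ... ]
    B = [ import A; ... ]  -- does not see the Eq T instance
\end{verbatim}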

Like modular type classes, Backpack does not give rise to incoherence,
because instance visibility can only be changed at the top level module
language, where it is already considered best practice to provide
explicit signatures.  Here is the example used in the Modular Type
Classes paper to demonstrate the problem:

\begin{verbatim}
structure A = using EqInt1 in
    struct ...fun f x = eq(x,x)... end
structure B = using EqInt2 in
    struct ...val y = A.f(3)... end
\end{verbatim}

Is the type of f \verb|int -> bool|, or does it have a type-class
constraint?  Because type checking proceeds over the entire program, ML
could hypothetically pick either.  However, ported to Haskell, the
example looks like this:

\begin{verbatim}
EqInt1 :: [ instance Eq Int ]
EqInt2 :: [ instance Eq Int ]
A = [
    import EqInt1
    f x = x == x
]
B = [
    import EqInt2
    import A hiding (instance Eq Int)
    y = f 3
]
\end{verbatim}

There may be ambiguity, yes, but it can be easily resolved by the
addition of a top-level type signature to \verb|f|, which is considered
best-practice anyway.  Additionally, Haskell users are trained to expect
a particular inference for \verb|f| in any case (the polymorphic one).

Here is another example which might be considered surprising:

\begin{verbatim}
package p where
    A :: [ data T = T ]
    B :: [ data T = T ]
    C = [
        import qualified A
        import qualified B
        instance Show A.T where show T = "A"
        instance Show B.T where show T = "B"
        x :: String
        x = show A.T ++ show B.T
    ]
\end{verbatim}

In the original Backpack paper, it was implied that module \verb|C|
should not type check if \verb|A.T = B.T| (failing at link time).
However, if we set aside, for a moment, the issue that anyone who
imports \verb|C| in such a context will now have overlapping instances,
there is no reason in principle why the module itself should be
problematic.  Here is the example in MTCs, which (I have it on good
word from Derek) does type check.

\begin{verbatim}
signature SIG = sig
    type t
    val mk : t
end
signature SHOW = sig
    type t
    val show : t -> string
end
functor Example (A : SIG) (B : SIG) =
    let structure ShowA : SHOW = struct
        type t = A.t
        fun show _ = "A"
    end in
    let structure ShowB : SHOW = struct
        type t = B.t
        fun show _ = "B"
    end in
    using ShowA, ShowB in
    struct
        val x = show A.mk ++ show B.mk
    end : sig val x : string end
\end{verbatim}

The moral of the story is, even though in a later context the instances
are overlapping, inside the functor, the type-class resolution is unambiguous
and should be done (so \verb|x = "AB"|).

Up until this point, we've argued why MTCs and this Backpack design are similar.
However, there is an important sociological difference between modular type-classes
and this proposed scheme for Backpack.  In the presentation ``Why Applicative
Functors Matter'', Derek mentions the canonical example of defining a set:

\begin{verbatim}
signature ORD = sig type t; val cmp : t -> t -> bool end
signature SET = sig type t; type elem;
    val empty : t;
    val insert : elem -> t -> t ...
end
functor MkSet (X : ORD) :> SET where type elem = X.t
  = struct ... end
\end{verbatim}

This is actually very different from how sets tend to be defined in
Haskell today.  If we directly encoded this in Backpack, it would
look like this:

\begin{verbatim}
package mk-set where
    X :: [
        data T
        cmp :: T -> T -> Bool
    ]
    Set :: [
        import X
        data Set
        empty :: Set
        insert :: T -> Set -> Set
    ]
    Set = [
        import X
        ...
    ]
\end{verbatim}
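
To use \verb|mk-set|, a client must link \verb|X| against a concrete
implementation; a hypothetical instantiation might look like this:

\begin{verbatim}
package int-set where
    X = [
        data T = MkT Int
        cmp (MkT x) (MkT y) = x < y
    ]
    include mk-set
\end{verbatim}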

It's also informative to consider how MTCs would encode a set as it is
written today in Haskell:

\begin{verbatim}
signature ORD = sig type t; val cmp : t -> t -> bool end
signature SET = sig type 'a t;
    val empty : 'a t;
    val insert : (X : ORD) => X.t -> X.t t -> X.t t
end
struct MkSet :> SET = struct ... end
\end{verbatim}

Here, it is easy to see that while functor instantiation occurs for the
implementation, it does not occur for the types.  This is a big
limitation of the Haskell approach, and it explains why Haskellers, in
practice, find global uniqueness of instances so desirable.
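
For reference, the Haskell-today encoding being mimicked is essentially
the interface of \verb|Data.Set|, where a single type constructor
serves all element types:

\begin{verbatim}
data Set a
empty  :: Set a
insert :: Ord a => a -> Set a -> Set a
\end{verbatim}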

Implementation-wise, this requires some subtle modifications to how we
do type class resolution.  Type checking of indefinite modules works as
before, but when we go to actually compile them against explicit
implementations, we need to ``forget'' that two types are equal when
doing instance resolution.  This could probably be implemented by
associating type class instances with the original name that was used
when typechecking, so that we can resolve ambiguous matches against
types which have the same original name now that we are compiling.
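
As a concrete sketch (all names and types below are hypothetical
simplifications, not GHC's actual data structures), the instance
environment might be keyed on original names rather than on unified
identities:

\begin{verbatim}
import qualified Data.Map.Strict as Map

type ModuleId = String   -- stand-in for a real module identity

-- An original name: the module identity the entity was typechecked
-- under, plus its occurrence name.
data OrigName = OrigName ModuleId String deriving (Eq, Ord)

-- Instances are indexed by (class name, original name of head type),
-- so two types that are later unified still resolve separately.
newtype InstEnv inst = InstEnv (Map.Map (String, OrigName) inst)

lookupInst :: InstEnv inst -> String -> OrigName -> Maybe inst
lookupInst (InstEnv env) cls ty = Map.lookup (cls, ty) env
\end{verbatim}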

As we've mentioned previously, this strategy is unsound for type families.

\subsection{Globally unique}

Here, we say \textbf{YES} to global uniqueness.

When we require the global uniqueness of instances (either because
that's the type class design we chose, or because we're considering
the problem of type families), we will need to reject declarations like the
one cited above when \verb|A.T = B.T|:

\begin{verbatim}
A :: [ data T ]
B :: [ data T ]
C :: [
    import qualified A
    import qualified B
    instance Show A.T where show T = "A"
    instance Show B.T where show T = "B"
]
\end{verbatim}

The paper mentions that a link-time check is sufficient to prevent this
case from arising.  While in the previous section we argued that this
check is actually unnecessary when local instances are allowed, it is a
good match in the case of global instances, because any instance
\emph{must} be declared in the signature.  The scheme proceeds as
follows: when some instances are typechecked initially, we type check
them as if all variable module identities were distinct.  Then, when we
perform linking (we \verb|include| or we unify some module identities),
we check again to see if we have introduced any instance overlap.  This
linking check is akin to the eager check that is performed today for
type families, and it would need to be implemented for type classes as
well; the twist is that we are \emph{redoing} the overlap check now
that some identities have been unified.
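
Here is a minimal sketch of the relinking check (representations
hypothetical; real instance heads are of course more complex):

\begin{verbatim}
import qualified Data.Map.Strict as Map
import Data.List (tails)

type ModVar = String                       -- module identity variable
data ModId = Var ModVar | Conc String deriving (Eq, Ord, Show)
data InstHead = InstHead String (ModId, String)  -- class, head type
    deriving (Eq, Ord, Show)

type Subst = Map.Map ModVar ModId          -- produced by linking

substHead :: Subst -> InstHead -> InstHead
substHead s (InstHead c (Var v, t)) =
    InstHead c (Map.findWithDefault (Var v) v s, t)
substHead _ h = h

-- Redo the overlap check after unification: heads that were distinct
-- when first typechecked may have become equal.
recheckOverlap :: Subst -> [InstHead] -> Either (InstHead, InstHead) ()
recheckOverlap s heads =
    case [ (a, b) | (a:rest) <- tails (map (substHead s) heads)
                  , b <- rest, a == b ] of
        []      -> Right ()
        (p : _) -> Left p
\end{verbatim}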

As an implementation trick, one could defer the check until \verb|C| is
compiled, keeping in line with GHC's lazy ``don't check for overlap
until the use site'' behavior.  (Once again, this is unsound for type
families.)

\paragraph{What about module inequalities?}  An older proposal was for
signatures to contain ``module inequalities'', i.e., assertions that two
modules are not equal.  (Technically: we need to be able to apply this
assertion to $\beta$ module variables, since \verb|A != B| while
\verb|A.T = B.T|).  Currently, Edward thinks that module inequalities
are of only marginal utility with local instances (i.e., not enough to
justify the implementation cost) and not useful at all in the world of
global instances!

With local instances, module inequalities could be useful to statically
rule out examples like \verb|show A.T ++ show B.T|.  Because such uses
are not necessarily reflected in the signature, it would be a violation
of separate module development to try to divine the constraint from the
implementation itself.  I claim this is of limited utility, however, because,
as we mentioned earlier, we can compile these ``incoherent'' modules perfectly
coherently.  With global instances, all instances must be in the signature, so
while it might be aesthetically displeasing to have the signature impose
extra restrictions on linking identities, we can carry this out without
violating the linking restriction.

\subsection{Orphans}

Controlling instance visibility via signature thinning poses an
implementation challenge similar to that of orphan instances.  To
describe this problem,
we first have to describe how instance resolution works for orphans and
non-orphans in GHC today.

Type information for already compiled code in other packages is cached
on disk in interface files.  For efficiency reasons, it's desirable to
avoid loading an interface file until absolutely necessary: if we don't
use any of the identifiers from a file, we should not have to load its
interface.  Among other things, type class instances are stored in
interface files.

Signatures and hs-boot files notwithstanding, non-orphan instance
resolution is achieved through a (somewhat) astonishing coincidence: at
the point when a type class constraint is resolved, we are guaranteed
to have loaded the interfaces for all of the names involved in the
instantiation.  This means that if there is a relevant non-orphan
instance, we will have seen it; conversely, it means that non-orphan
instances are a closed world: if we load all these interfaces and see
no non-orphan instance, we know there will never be one.

Things are a bit worse for orphans: these instances are an open world,
and so the only way to tell if an orphan instance is in scope is by consulting
the transitive closure of module imports.
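
To illustrate the distinction with ordinary Haskell modules (a standard
example, not Backpack-specific):

\begin{verbatim}
module T where              -- non-orphan: the instance lives with the
    data T = T              -- type, so loading T's interface (which we
    instance Show T where   -- must do to resolve Show T) reveals it
        show _ = "T"

module M where              -- orphan: M defines neither Eq nor T, so
    import T                -- the instance is only found by walking
    instance Eq T where     -- the transitive closure of imports
        _ == _ = True
\end{verbatim}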

\section{Bits and bobs}

\subsection{Abstract type synonyms}

In Paper Backpack, abstract type synonyms are not permitted, because GHC
doesn't understand how to deal with them.  The purpose of this section
is to describe one particular nastiness of abstract type synonyms, by
way of the occurs check:

\begin{verbatim}
A :: [ type T ]
B :: [ import qualified A; type T = [A.T] ]
\end{verbatim}

At this point, it is illegal for \verb|A = B|; otherwise this type
synonym would fail the occurs check.  This seems like pretty bad news,
since every occurrence of the occurs check in the type-checker could
constitute a module inequality.
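
Concretely, if \verb|A = B|, then \verb|A.T = B.T|, and the synonym
unfolds without bound:

\begin{verbatim}
B.T = [A.T] = [B.T] = [[B.T]] = ...
\end{verbatim}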

\section{Open questions}\label{sec:open-questions}

Here are open problems about the implementation which still require
hashing out.

\begin{itemize}

    \item In Section~\ref{sec:simplifying-backpack}, we argued that we
        could implement Backpack without needing a shaping pass. We're
        pretty certain that this will work for typechecking and
        compiling fully definite packages with no recursive linking, but
        in Section~\ref{sec:typechecking-indefinite}, we described some
        of the prevailing difficulties of supporting signature linking.
        Renaming is not an insurmountable problem, but backwards flow of
        shaping information can be, and it is unclear how best to
        accommodate this.  This is probably the most important problem
        to overcome.

    \item In Section~\ref{sec:installing-indefinite}, a few choices for how to
        store source code were pitched; however, there is no consensus
        on which one is best.

    \item What is the impact of the multiplicity of PackageIds on
        dependency solving in Cabal?  Old questions of what to prefer
        when multiple package-versions are available arise again (Cabal
        originally only needed to solve this between different versions
        of the same package, preferring the oldest version), but with
        signatures, there are more choices.  Should there be a complex
        solver that does all signature solving, or a preprocessing step
        that reduces things back to the original Cabal solving problem?
        Authors may want to suggest a policy for which packages should
        actually link against signatures (so a crypto library doesn't
        accidentally link against a null cipher package).

\end{itemize}

\end{document}